Merge remote-tracking branch 'nccl/master' into develop

This commit is contained in:
Wenkai Du
2023-04-25 15:38:04 -07:00
79 changed files with 4609 additions and 2880 deletions
+4 -2
View File
@@ -210,9 +210,9 @@ set(HEADER_SOURCES
src/include/param.h
src/include/channel.h
src/include/nvtx_stub.h
src/include/nvtx3.hpp
src/include/core.h
src/include/info.h
src/include/ipcsocket.h
src/include/git_version.h
src/include/npkit/npkit_event.h
src/include/npkit/npkit.h
@@ -301,6 +301,7 @@ set(CC_SOURCES
src/misc/socket.cc
src/misc/param.cc
src/misc/rocmwrap.cc
src/misc/ipcsocket.cc
src/misc/strongstream.cc
src/misc/msccl/msccl_lifecycle.cc
src/misc/msccl/msccl_parser.cc
@@ -463,11 +464,12 @@ foreach(target ${AMDGPU_TARGETS})
target_link_libraries(rccl PRIVATE --amdgpu-target=${target})
endforeach()
set(ENABLE_IFC 1 CACHE BOOL "Enable indirect function call")
if("${HIP_COMPILER}" MATCHES "clang")
find_program( hipcc_executable hipcc )
execute_process(COMMAND bash "-c" "${hipcc_executable} --version | grep 'HIP version' | awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'" OUTPUT_VARIABLE hipcc_version_string)
message(STATUS "hipcc version: ${hipcc_version_string}")
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201" AND ENABLE_IFC)
add_definitions(-DUSE_INDIRECT_FUNCTION_CALL)
target_compile_options(rccl PRIVATE -fvisibility=hidden)
message(STATUS "Indirect function call enabled")
+2 -2
View File
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 16
NCCL_PATCH := 5
NCCL_MINOR := 17
NCCL_PATCH := 1
NCCL_SUFFIX :=
PKG_REVISION := 1
+3 -2
View File
@@ -12,7 +12,8 @@ INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
misc/ipcsocket.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -62,7 +63,7 @@ ALWAYS_REBUILD:
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
$(INCDIR)/nccl.h : nccl.h.in
$(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
# NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
mkdir -p $(INCDIR)
+18
View File
@@ -394,6 +394,24 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
return ncclSuccess;
}
// IntraNode in-place Broadcast
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
if (nranks == 1) return ncclSuccess;
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
if (rank == root) {
for (int i=0; i<nranks; i++) {
if (i != root) NCCLCHECK(bootstrapSend(commState, ranks[i], /*tag=*/ranks[i], bcastData, size));
}
}
else {
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/rank, bcastData, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
return ncclSuccess;
}
ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
// New unex
struct unexConn* unex;
+4 -3
View File
@@ -13,14 +13,15 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
if (channel->id != -1) return ncclSuccess;
int nRanks = comm->nRanks;
int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */;
channel->id = channelId;
channel->workFifoSent = 0;
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
// The extra on nRanks+1 is for collnet root (i.e. network)
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.cudaStream));
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nPeers);
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, comm->deviceStream.cudaStream));
ncclCommPushCudaFree(comm, channel->devPeers);
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
@@ -29,7 +30,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
for (int r=0; r < nRanks+1; ++r) {
for (int r=0; r < nPeers; ++r) {
for (int b=0; b < NCCL_MAX_CONNS; b++) {
channel->peers[r].send[b].comm = comm;
channel->peers[r].recv[b].comm = comm;
+42
View File
@@ -112,3 +112,45 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
runRing<T, RedOp, ProtoLL128>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsGather = 128;
const int nThreadsBcast = 384 + WARP_SIZE;
const int tidEndGather = nThreadsGather;
const int tidEndBcast = tidEndGather + nThreadsBcast;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndGather) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndBcast) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Bcast through MC
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
}
}
}
};
+63 -4
View File
@@ -358,7 +358,7 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem.channel.binTree : &ncclShmem.channel.tree;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
: Proto::calcBytePerStep()/sizeof(T));
@@ -583,9 +583,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
if (args->regUsed) {
prims.directScatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
} else {
prims.scatter(offset, nelem, chunkSize, direct->headRank, direct->shift);
prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
}
} else if (tid >= tidStartReduce && direct->out != -1) {
@@ -621,7 +621,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
prims.directGather(offset, nelem, chunkSize, direct->headRank, direct->shift);
prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
int group = (1*Proto::MaxGroupWidth) | (0<<16);
@@ -648,6 +648,65 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const int reduceWarps = nranks <= 6 ? 6 : 4;
const int copyWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps)/2;
const int nThreadsScatter = copyWarps*WARP_SIZE;
const int nThreadsGather = (copyWarps-1)*WARP_SIZE;
const int nThreadsReduce = (reduceWarps+1)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
using Proto = ProtoSimple<1, 1, COLL_UNROLL, /*NVLS=*/true>;
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
int group = (2*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce, broadcast through NVLS
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
+1 -1
View File
@@ -52,7 +52,7 @@ namespace {
const T* sendbuff = (const T*)args->sendbuff + send_offset;
T* recvbuff = (T *)args->recvbuff + recv_offset;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(
tid, nthreads, nullptr, false, 1, &sendbuff, 1, &recvbuff, send_recv_size);
tid, nthreads, 0, nullptr, false, 1, (void **)&sendbuff, 1, (void **)&recvbuff, send_recv_size);
} else {
for (ssize_t prims_offset = 0; prims_offset < send_recv_size; prims_offset += prims_size) {
const int prims_nelem = min(prims_size, send_recv_size - prims_offset);
+66 -153
View File
@@ -29,14 +29,15 @@
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#define NCCL_FUNC4(func, devredop, type, nullify) \
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify)
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
@@ -113,94 +114,8 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
#endif
};
#define NCCL_FUNC5_LL128(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#define NCCL_FUNC4_LL128(func, devredop, type, nullify) \
NCCL_FUNC5_LL128(func, TREE, devredop, type, nullify), \
NCCL_FUNC5_LL128(func, RING, devredop, type, nullify), \
NCCL_FUNC5_LL128(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5_LL128(func, COLLNET_CHAIN, devredop, type, nullify)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A_LL128(func, devredop, nullForFloat) \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, uint8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int32_t, 0), \
NCCL_FUNC4_LL128(func, devredop, uint32_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int64_t, 0), \
NCCL_FUNC4_LL128(func, devredop, uint64_t, 0), \
NCCL_FUNC4_LL128(func, devredop, half, nullForFloat), \
NCCL_FUNC4_LL128(func, devredop, float, nullForFloat), \
NCCL_FUNC4_LL128(func, devredop, double, nullForFloat), \
NCCL_FUNC4_LL128(func, devredop, rccl_bfloat16, nullForFloat)
#define NCCL_FUNCS3B_LL128(func, devredop) \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0), \
NCCL_FUNC4_LL128(func, devredop, int8_t, 0)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A_LL128(func) \
NCCL_FUNCS3A_LL128(func, Sum, /*nullForFloat=*/0), \
NCCL_FUNCS3A_LL128(func, Prod, /*nullForFloat=*/0), \
NCCL_FUNCS3A_LL128(func, Max, /*nullForFloat=*/0), \
NCCL_FUNCS3A_LL128(func, Min, /*nullForFloat=*/0), \
NCCL_FUNCS3A_LL128(func, PreMulSum, /*nullForFloat=*/0), \
NCCL_FUNCS3A_LL128(func, SumPostDiv, /*nullForFloat=*/1)
#define NCCL_FUNCS2B_LL128(func) \
NCCL_FUNCS3B_LL128(func, Sum), \
NCCL_FUNCS3B_LL128(func, Sum), \
NCCL_FUNCS3B_LL128(func, Sum), \
NCCL_FUNCS3B_LL128(func, Sum), \
NCCL_FUNCS3B_LL128(func, Sum), \
NCCL_FUNCS3B_LL128(func, Sum)
// Must be consistent with the ncclFuncSet enum
using ncclKernelFunc_t = void (*)();
static const __device__ constexpr ncclKernelFunc_t ncclFuncs_ll128[]{
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if defined(__HIP_DEVICE_COMPILE__)
#if defined(BUILD_ALLREDUCE_ONLY)
NCCL_FUNC4_LL128(AllReduce, Sum, float, 0),
#else
NCCL_FUNCS2B_LL128(Broadcast),
NCCL_FUNCS2A_LL128(Reduce),
NCCL_FUNCS2B_LL128(AllGather),
NCCL_FUNCS2A_LL128(ReduceScatter),
NCCL_FUNCS2A_LL128(AllReduce),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
#if defined(RCCL_BFLOAT16)
NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16),
#endif
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_FUNC_NAME(AllToAllPivot, RING, SIMPLE, Sum, int8_t),
#endif
#endif
};
static_assert(FUNC_INDEX_P2P == 3610, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 3611, "Wrong AllToAllPivot function index");
static_assert(FUNC_INDEX_P2P == 4510, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 4511, "Wrong AllToAllPivot function index");
#ifndef USE_INDIRECT_FUNCTION_CALL
template<unsigned short f, unsigned short l, bool u>
@@ -217,7 +132,7 @@ struct Caller {
template<unsigned short f, bool u>
struct Caller<f, f + 1, u>{
static __forceinline__ __device__ __host__
void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); }
void call(unsigned short funcIndex) noexcept { ncclFuncs[f](); }
};
template<bool USING_LL128>
@@ -260,46 +175,46 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
else
assert("Unsupported function index");
#else
if (funcIndex < 720) {
if (funcIndex % 12 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (funcIndex % 12 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 12 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (funcIndex % 12 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 12 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 12 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 12 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
if (funcIndex < 900) {
if (funcIndex % 15 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (funcIndex % 15 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (funcIndex % 15 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 15 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
}
else if (funcIndex < 1440) Caller<720, 1440, USING_LL128>::call(funcIndex);
else if (funcIndex < 2160) {
if (funcIndex % 12 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (funcIndex % 12 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 12 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (funcIndex % 12 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 12 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 12 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 12 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 12 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 12 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else if (funcIndex < 1800) Caller<900, 1800, USING_LL128>::call(funcIndex);
else if (funcIndex < 2700) {
if (funcIndex % 15 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (funcIndex % 15 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (funcIndex % 15 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 15 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 15 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 15 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
}
else if (funcIndex < 3600) Caller<2160, 3600, USING_LL128>::call(funcIndex);
else if (funcIndex < 4500) Caller<2700, 4500, USING_LL128>::call(funcIndex);
else {
switch (funcIndex - 3600) {
switch (funcIndex - 4500) {
case 0:
ncclFunction_OneRankReduce_PreMulSum_int8_t();
break;
@@ -479,22 +394,19 @@ class ncclFunction {
#define traceData(data2, data4, data8_0, data8_1)
#endif
struct ncclShmemGroup {
ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
ncclConnInfo *sendConns[NCCL_MAX_DIRECT_ARITY];
void* srcs[NCCL_MAX_DIRECT_ARITY+1];
void* dsts[NCCL_MAX_DIRECT_ARITY+1];
int totalSendSize[NCCL_MAX_SLICE_PER_CHUNK];
ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
void* srcs[NCCL_MAX_NVLS_ARITY+1];
void* dsts[NCCL_MAX_NVLS_ARITY+1];
int nvlsRecv;
uint64_t barrier;
uint64_t barrier_next[NCCL_MAX_GROUPS];
};
struct ncclShmemData {
union {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
};
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
int channelId;
int aborted;
alignas(16) struct ncclDevComm comm;
@@ -507,6 +419,15 @@ struct ncclShmemData {
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned");
extern __shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ >= 700
extern __shared__ ulong2 ncclShmemPerWarp[/*ncclShmemDynamicSize()/sizeof(ulong2)*/];
#else
extern __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
__device__ inline void* ncclScratchForWarp(int warp) {
return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
}
#ifdef ENABLE_PROFILING
#define __insert_timestamp(line_num) do { \
@@ -578,7 +499,7 @@ static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we
}
}
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE>
__forceinline__ __device__ void ncclKernel(
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
) {
@@ -670,10 +591,9 @@ __forceinline__ __device__ void ncclKernel(
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
} else {
#ifdef USE_INDIRECT_FUNCTION_CALL
if (USING_LL128) ncclFuncs_ll128[ncclShmem.work.header.funcIndex]();
else ncclFuncs[ncclShmem.work.header.funcIndex]();
ncclFuncs[ncclShmem.work.header.funcIndex]();
#else
NCCL_CALL_FUNCTIONS<USING_LL128>(ncclShmem.work.header.funcIndex);
NCCL_CALL_FUNCTIONS<1>(ncclShmem.work.header.funcIndex);
#endif
}
@@ -705,22 +625,12 @@ __forceinline__ __device__ void ncclKernel(
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, false>(comm, channelMask, workHead); \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(comm, channelMask, workHead); \
} \
\
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, false>(comm, channelMask, workHead); \
} \
\
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, true>(comm, channelMask, workHead); \
} \
\
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, true>(comm, channelMask, workHead); \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(comm, channelMask, workHead); \
}
// Examples : AllReduce, RING, LL, Sum, uint8
@@ -748,7 +658,8 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
IMPL_COLL4(func, TREE, devredop, type, ncclType) \
IMPL_COLL4(func, RING, devredop, type, ncclType) \
IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType)
IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
IMPL_COLL4(func, NVLS, devredop, type, ncclType)
#define IMPL_COLL2(func, devredop) \
IMPL_COLL3(func, devredop, int8_t, ncclInt8) \
@@ -791,4 +702,6 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
#define IMPL_COLL_F(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t);
#define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
#endif
File diff suppressed because it is too large Load Diff
+5 -1
View File
@@ -10,6 +10,9 @@
#include "common.h"
__shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ < 700
__shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
@@ -22,7 +25,8 @@ __shared__ ncclShmemData ncclShmem;
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify)
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t
+1 -1
View File
@@ -114,7 +114,7 @@ __device__ __forceinline__ static void threadBlockCopy(
for (int r = 0; r < numloops; r++) { \
srcOffset = srcBaseOffset + (ssize_t)mscclShmem.mscclTB.reductionSrcOffsets[t->reductionPointer+r] * sizePerMscclChunk; \
reduceInput = load(srcPointer + srcOffset); \
o = redFn(reduceInput, o); \
o = applyReduce(redFn, reduceInput, o); \
}
#define MSCCL_REDUCE_UNROLL_LOOP_B(numloops) \
+5 -3
View File
@@ -7,7 +7,7 @@
#include "devcomm.h"
#include "collectives.h"
#include "reduce_kernel.h"
#include "common_kernel.h"
#include "common.h"
namespace {
@@ -40,8 +40,10 @@ namespace {
i1 = i1 < eltN ? i1 : eltN;
src += i0;
dst += i0;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1>
(tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0);
void *vsrc = (void*)src;
void *vdst = (void*)dst;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/1>
(tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
}
}
}
+290 -15
View File
@@ -8,29 +8,27 @@
#define OP128_H_
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(ptr));
v0 = __builtin_nontemporal_load(ptr);
v1 = __builtin_nontemporal_load(ptr+1);
}
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
:: "l"(v0), "l"(v1), "l"(ptr));
__builtin_nontemporal_store(v0, ptr);
__builtin_nontemporal_store(v1, ptr+1);
}
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
uint64_t* shmemAsmPtr;
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
return shmemAsmPtr;
return (uint64_t*)shmemGenericPtr;
}
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
v0 = *(shmemAsmPtr);
v1 = *(shmemAsmPtr+1);
}
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
:: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
*(shmemAsmPtr) = v0;
*(shmemAsmPtr+1) = v1;
}
template<typename T>
@@ -46,23 +44,300 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
// Produce 4 bytes of sub-register type by reading 2 4-byte
// aligned values and shifting.
uint32_t lo, hi;
asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0));
asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1));
lo = __builtin_nontemporal_load(ptr4+e+0);
hi = __builtin_nontemporal_load(ptr4+e+1);
tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
}
}
else if(sizeof(T) == 4) {
#pragma unroll
for(int e=0; e < 4; e++)
asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e));
tmp4[e] = __builtin_nontemporal_load(ptr+e);
}
else /*sizeof(T)==8*/ {
#pragma unroll
for(int e=0; e < 2; e++)
asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e));
tmp8[e] = __builtin_nontemporal_load(ptr+e);
}
v0 = tmp8[0];
v1 = tmp8[1];
}
template<typename T>
__device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) {
return (uint32_t)(uint64_t)(ptr);
}
template<typename T>
__device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) {
return (uintptr_t)(ptr);
}
template<typename T>
__device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) {
return (T*)shptr;
}
template<typename T>
__device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
return (T*)gptr;
}
////////////////////////////////////////////////////////////////////////////////
// BytePack<Size>: struct of bytes.
template<int Size>
union BytePack;
template<>
union BytePack<1> {
uint8_t u8, native;
};
template<>
union BytePack<2> {
BytePack<1> half[2];
uint8_t u8[2];
uint16_t u16, native;
};
template<>
union BytePack<4> {
BytePack<2> half[2];
uint8_t u8[4];
uint16_t u16[2];
uint32_t u32, native;
};
template<>
union BytePack<8> {
BytePack<4> half[2];
uint8_t u8[8];
uint16_t u16[4];
uint32_t u32[2];
uint64_t u64, native;
};
template<>
union alignas(16) BytePack<16> {
BytePack<8> half[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
#ifndef USE_INDIRECT_FUNCTION_CALL
inline __device__ BytePack<16>& operator=(BytePack<16> other) {
u64[0] = other.u64[0];
u64[1] = other.u64[1];
return *this;
}
#endif
};
template<typename T>
__device__ __forceinline__ BytePack<sizeof(T)> toPack(T value) {
union { BytePack<sizeof(T)> p; T v; };
v = value;
return p;
}
template<typename T>
__device__ __forceinline__ T fromPack(BytePack<sizeof(T)> pack) {
union { BytePack<sizeof(T)> p; T v; };
p = pack;
return v;
}
////////////////////////////////////////////////////////////////////////////////
// Load/store of BytePack<?> using integral addresses.
template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
//template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
//template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
//template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
tmp = *((data_cxx_ty *)addr); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
tmp = __builtin_nontemporal_load((data_cxx_ty *)addr); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<bytes>(addr_cxx_ty addr, BytePack<bytes> value) { \
data_cxx_ty tmp = value.native; \
*((data_cxx_ty *)addr) = tmp; \
}
// Single-byte types use 4-byte registers since there is no 1-byte register
// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
DEFINE_ld_st(1, uint8_t, b8, r, global, uintptr_t, l)
//DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
//DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
//DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
//DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
#undef DEFINE_ld_st
#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
ans.u64[0] = *((uint64_t*)addr); \
ans.u64[1] = *((uint64_t*)addr+1); \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
ans.u64[0] = __builtin_nontemporal_load((uint64_t*)addr); \
ans.u64[1] = __builtin_nontemporal_load((uint64_t*)addr+1); \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
*((uint64_t*)addr) = value.u64[0]; \
*((uint64_t*)addr+1) = value.u64[1]; \
}
DEFINE_ld_st_16(global, uintptr_t, l)
//DEFINE_ld_st_16(shared, uint32_t, r)
#undef DEFINE_ld_st_16
////////////////////////////////////////////////////////////////////////////////
// Atomic load/store using c++ pointers.
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
uint64_t ans;
ans = __builtin_nontemporal_load(ptr);
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
uint64_t ans;
ans = __builtin_nontemporal_load(ptr);
return ans;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
uint64_t ans;
ans = __atomic_load_n(ptr ,__ATOMIC_SEQ_CST);
return ans;
}
__device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) {
__builtin_nontemporal_store(val, ptr);
}
__device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
__builtin_nontemporal_store(val, ptr);
}
__device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) {
__atomic_store_n(ptr, val, __ATOMIC_SEQ_CST);
}
__device__ __forceinline__ void fence_acq_rel_sys() {
//asm volatile("membar.sys;" ::: "memory");
}
__device__ __forceinline__ void fence_acq_rel_gpu() {
//asm volatile("membar.gl;" ::: "memory");
}
////////////////////////////////////////////////////////////////////////////////
// Multimem stores of BytePack<?>.
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val);
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<>
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};"
:: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3])
: "memory");
}
#else
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
// nop
}
#endif
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
// Warp-uniform memory copy from shared address (not generic) to global memory.
// The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value
// is interpeted as zero. EltSize is the guaranteed alignment of the addresses and sizes.
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "`IntBytes` must be a signed integral type.");
int nBytes = min(nBytesAhead, (IntBytes)MaxBytes);
int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16);
int nMiddleBytes = (nBytes-nFrontBytes) & -16;
int nBackBytes = (nBytes-nFrontBytes) % 16;
{ int backLane = WARP_SIZE-1 - lane;
bool hasFront = lane*EltSize < nFrontBytes;
bool hasBack = backLane*EltSize < nBackBytes;
int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize);
if (hasFront | hasBack) {
BytePack<EltSize> tmp = ld_shared<EltSize>(srcAddr+offset);
// Can't use multimem_st since it doesn't support EltSize==2
st_global<EltSize>(dstAddr+offset, tmp);
}
}
srcAddr += nFrontBytes;
int srcMisalign = EltSize < 4 ? (srcAddr%4) : 0;
srcAddr += -srcMisalign + lane*16;
dstAddr += nFrontBytes + lane*16;
nMiddleBytes -= lane*16;
#pragma unroll
for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) {
if (nMiddleBytes <= 0) break;
union {
BytePack<4> b4[4];
BytePack<16> b16;
};
b4[0] = ld_shared<4>(srcAddr + 0*4);
b4[1] = ld_shared<4>(srcAddr + 1*4);
b4[2] = ld_shared<4>(srcAddr + 2*4);
b4[3] = ld_shared<4>(srcAddr + 3*4);
if (srcMisalign != 0) {
BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8);
b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8);
b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8);
b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8);
}
if (Multimem) multimem_st_global<16>(dstAddr, b16);
else st_global<16>(dstAddr, b16);
srcAddr += WARP_SIZE*16;
dstAddr += WARP_SIZE*16;
nMiddleBytes -= WARP_SIZE*16;
}
}
#else
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
// nop
}
#endif
#endif
+3 -1
View File
@@ -10,6 +10,7 @@
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
#include "common_kernel.h"
#include "common.h"
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
@@ -37,12 +38,13 @@
* to how that protocol operates with a consistent interface so that our
* algorithm code can operate protocol parametrically.
*/
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL>
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, bool NVLS_1 = false>
struct ProtoSimple {
static constexpr int Id = NCCL_PROTO_SIMPLE;
static constexpr int SlicePerChunk = SlicePerChunk_1;
static constexpr int StepPerSlice = StepPerSlice_1;
static constexpr int Unroll = Unroll_1;
static constexpr bool NVLS = NVLS_1;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
+7 -12
View File
@@ -77,11 +77,6 @@ private:
#endif
}
static inline __device__ uint32_t __funnelshift_r(uint32_t lo, uint32_t hi, uint32_t shift) {
uint64_t val64 = ((uint64_t)lo+((uint64_t)hi<<32))>>(shift&31);
return (uint32_t)val64;
}
uint32_t abort = 0;
inline __device__ int checkAbort(int &spins, int send) {
@@ -426,18 +421,18 @@ private:
}
if (SRC) {
data = dl.loadFinish();
if (SrcBuf == Input) data = MULTI<RedOp, T>().preOp(redOp, data);
if (SrcBuf == Input) data = applyPreOp(redOp, data);
}
if (RECV) {
data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
#pragma unroll
data = !SRC ? peerData : applyReduce(redOp, peerData, data);
#pragma unroll MaxRecv
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
peerData = readLLFinish(offset, line, i);
data = MULTI<RedOp,T>()(redOp, peerData, data);
data = applyReduce(redOp, peerData, data);
}
}
if (postOp) data = MULTI<RedOp, T>().postOp(redOp, data);
if (postOp) data = applyPostOp(redOp, data);
// Send : inter-node, then intra-node, then local
if (SEND) {
@@ -511,13 +506,13 @@ private:
uint64_t dataD;
dl.loadBegin(dstElts, eltInLine);
dataD = dl.loadFinish();
dataD = MULTI<RedOp,T>()(redOp, dataD, data);
dataD = applyReduce(redOp, dataD, data);
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
dl.loadBegin(srcs[i], eltInLine);
srcs[i] += eltPerTrip;
data = dl.loadFinish();
dataD = MULTI<RedOp,T>()(redOp, dataD, data);
dataD = applyReduce(redOp, dataD, data);
}
}
mscclStoreData(dstElts, dataD, eltInLine);
+17 -21
View File
@@ -249,9 +249,9 @@ private:
if (SrcBuf == Input) {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = MULTI<RedOp, T>().preOp(redOp, v[u]);
v[u] = applyPreOp(redOp, v[u]);
if (!flagThread)
v[u+1] = MULTI<RedOp, T>().preOp(redOp, v[u+1]);
v[u+1] = applyPreOp(redOp, v[u+1]);
}
}
}
@@ -262,8 +262,8 @@ private:
uint64_t* ptr = recvPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = SRC ? MULTI<RedOp, T>()(redOp, vr[u], v[u]) : vr[u];
v[u+1] = SRC ? MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]) : vr[u+1];
v[u] = SRC ? applyReduce(redOp, vr[u], v[u]) : vr[u];
v[u+1] = SRC ? applyReduce(redOp, vr[u+1], v[u+1]) : vr[u+1];
}
}
@@ -283,20 +283,24 @@ private:
needReload &= (0 == checkAbort(spins, i, 0));
} while (__any(needReload));
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2)
load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = MULTI<RedOp, T>()(redOp, vr[u], v[u]);
v[u+1] = MULTI<RedOp, T>()(redOp, vr[u+1], v[u+1]);
v[u] = applyReduce(redOp, vr[u], v[u]);
v[u+1] = applyReduce(redOp, vr[u+1], v[u+1]);
}
}
}
/********************** End Recv ************************/
if (postOp && !FuncTraits<RedOp>::IsPostOpIdentity) {
if (postOp) {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = MULTI<RedOp, T>().postOp(redOp, v[u]);
v[u+1] = MULTI<RedOp, T>().postOp(redOp, v[u+1]);
v[u] = applyPostOp(redOp, v[u]);
v[u+1] = applyPostOp(redOp, v[u+1]);
}
}
@@ -332,14 +336,6 @@ private:
__device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
constexpr int SRC = SrcBuf != -1 ? 1 : 0;
constexpr int DST = DstBuf != -1 ? 1 : 0;
static_assert(-1<=SrcBuf && SrcBuf < 2, "Uhoh");
static_assert(-1<=DstBuf && DstBuf < 2, "Uhoh");
static_assert(DstBuf!=Input, "Mistake?");
#if 0
assert((SrcBuf==-1) == (srcIx==-1));
assert((DstBuf==-1) == (dstIx==-1));
#endif
T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
int wireOffset = WireWordPerSlice*warp + 2*wid;
@@ -403,18 +399,18 @@ private:
loadRegsFinish(regsD);
#pragma unroll
for (int u=0; u<NCCL_LL128_SHMEM_ELEMS_PER_THREAD; u+=2) {
regsD[u] = MULTI<RedOp, T>()(redOp, regs[u], regsD[u]);
regsD[u] = applyReduce(redOp, regs[u], regsD[u]);
if (!flagThread)
regsD[u+1] = MULTI<RedOp, T>()(redOp, regs[u+1], regsD[u+1]);
regsD[u+1] = applyReduce(redOp, regs[u+1], regsD[u+1]);
}
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
loadRegsBegin(regs, srcs[i], eltInSlice);
loadRegsFinish(regs);
for (int u=0; u<NCCL_LL128_SHMEM_ELEMS_PER_THREAD; u+=2) {
regsD[u] = MULTI<RedOp, T>()(redOp, regs[u], regsD[u]);
regsD[u] = applyReduce(redOp, regs[u], regsD[u]);
if (!flagThread)
regsD[u+1] = MULTI<RedOp, T>()(redOp, regs[u+1], regsD[u+1]);
regsD[u+1] = applyReduce(redOp, regs[u+1], regsD[u+1]);
}
}
}
+110 -104
View File
@@ -13,9 +13,9 @@
#include "msccl/msccl_struct.h"
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, bool NVLS>
class Primitives<
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, NVLS>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -30,8 +30,10 @@ class Primitives<
SizesFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
ThreadsSynced = 0x800;
const int tid;
ThreadsSynced = 0x800,
NvlsMinPolling = 0x1000,
NvlsRecv = 0x2000;
const int tid, tidInBlock;
int nthreads;
int nworkers;
const int stepSize;
@@ -49,7 +51,7 @@ class Primitives<
int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
T *directBuff; // !(flags & SizesFifoEnabled)
};
uint64_t volatile *connStepPtr;
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
uint64_t* barriers;
uint64_t* barrier_next;
@@ -66,28 +68,15 @@ private:
// Don't use barrier 0 as it's used by the final sync
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
flags |= ThreadsSynced;
if (nthreads == WARP_SIZE)
__syncwarp();
else
barrier_by_group();
#else
if (nthreads == WARP_SIZE)
__syncwarp();
else
asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads));
#endif
flags |= ThreadsSynced;
}
inline __device__ void subBarrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
barrier();
#else
if (nworkers == nthreads)
barrier();
else
asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers));
#endif
}
inline __device__ bool checkAbort(int &spins) {
@@ -102,6 +91,19 @@ private:
return flags & Aborted;
}
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (NVLS && (flags & NvlsMinPolling)) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
#endif
// volatile is faster than acquire but not as correct. Make sure ReduceOrCopyMulti
// loads data using volatile so it doesn't see stale data in L1.
return atomicAdd((unsigned long long *)ptr, 0);
}
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
__device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) {
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
@@ -112,7 +114,7 @@ private:
int spins = 0;
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
__builtin_amdgcn_s_sleep(1);
connStepCache = atomicAdd((unsigned long long *)connStepPtr, 0);
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
@@ -153,12 +155,18 @@ private:
}
template<int Recv, int Send>
inline __device__ void postPeer() {
inline __device__ void postPeer(bool dataStored) {
if ((flags & Send*RolePostSend) && next_hdp_reg)
STORE((unsigned int *)next_hdp_reg, 0x1);
if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
step += StepPerSlice;
if (Send && (flags & RolePostSend) && dataStored)
#ifdef __GFX9__
__asm__ __volatile__("buffer_wbinvl1_vol");
#else
__threadfence_system();
#endif
STORE(connStepPtr, step);
}
}
@@ -202,6 +210,7 @@ private:
// barrier();
// post();
// } // Since we no longer unroll, new branch added here
#pragma unroll 1
do {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
@@ -212,8 +221,13 @@ private:
subBarrier();
/* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
* to 0 to avoid unnecessary workload. */
size_t workSize = ncclShmem.aborted ? 0 : sliceSize;
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
int workSize = ncclShmem.aborted ? 0 : sliceSize;
if (NVLS && ncclShmem.groups[group].nvlsRecv) {
void* src = ncclShmem.groups[group].srcs[0];
void* dst = ncclShmem.groups[group].dsts[0];
copyMultimemMultimem<RedOp>(tid, nworkers, ncclShmem.redOpArgs[0], postOp, src, dst, workSize,
cvta_to_shared(ncclScratchForWarp(tidInBlock/WARP_SIZE)));
} else if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
@@ -230,11 +244,10 @@ private:
}
#endif
// (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
(tid, nworkers, nullptr, false,
1, (T const**)ncclShmem.groups[group].srcs,
fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, MaxSend, /*PreOpSrcs*/0>
(tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
1, ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts+1,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -254,7 +267,6 @@ private:
}
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
// For broadcast in CollNet to do empty send
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
@@ -268,10 +280,10 @@ private:
}
#endif
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs, postOp,
Recv, (T const**)ncclShmem.groups[group].srcs,
Dst, (T**)ncclShmem.groups[group].dsts,
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs*/0>
(tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -289,7 +301,6 @@ private:
#endif
} else {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
@@ -303,12 +314,12 @@ private:
}
#endif
constexpr int PreOpN = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
(tid, nworkers, ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -327,13 +338,7 @@ private:
}
barrier(); // This barrier has a counterpart in following loop
#ifdef __GFX9__
if (Send && (flags & RolePostSend) && index == 0) __asm__ __volatile__("buffer_wbinvl1_vol");
#else
if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
#endif
__syncwarp();
postPeer<Recv, Send>();
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
} while (slice < SlicePerChunk && offset < nelem);
@@ -343,6 +348,7 @@ private:
// slices are all empty. Since empty slices are the uncommon case, and
// worker perf is the limiter, perf-wise this loop is effectively unentered,
// hence just a single branch insn.
#pragma unroll 1
while (slice < SlicePerChunk) {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
{ // Only workers could have Wait roles so we know the slice must be empty
@@ -350,13 +356,7 @@ private:
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
}
barrier(); // Has couterpart in preceding worker-only loop.
#ifdef __GFX9__
if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __asm__ __volatile__("buffer_wbinvl1_vol");
#else
if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
#endif
__syncwarp();
postPeer<Recv, Send>();
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
}
@@ -371,19 +371,19 @@ private:
nsrcs++;
if (MULTISRCS){
ReduceOrCopyMulti<Unroll, RedOp, T, 3, MSCCL_MAX_REDUCE_FUSION, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs, false, nsrcs, (T const**)srcs, 1, (T**)dsts, nelem);
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, nsrcs, (void **)srcs, 1, (void **)dsts, nelem);
} else {
ReduceOrCopyMulti<Unroll, RedOp, T, 2, 2, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs, false, 2, (T const**)srcs, 1, (T**)dsts, nelem);
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 2, (void **)srcs, 1, (void **)dsts, nelem);
}
}
if (COPY){
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs, false, 1, (T const**)srcs, 1, (T**)dsts, nelem);
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)srcs, 1, (void **)dsts, nelem);
if (MULTISRCS) {
for (int i = 1; i < nsrcs; i++){
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs, false, 1, (T const**)&srcs[i], 1, (T**)&dsts[i], nelem);
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)&srcs[i], 1, (void **)&dsts[i], nelem);
}
}
}
@@ -396,44 +396,46 @@ private:
// shift: peer offset to avoid all ranks sending to or receiving from same peer
template <int DirectRecv1, int DirectSend1, int Recv, int Send>
__device__ __forceinline__ void
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) {
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
int offset = 0; // slice offset
int sliceSize = stepSize*StepPerSlice;
int dataSize = max(DIVUP(peerElem, 16*SlicePerChunk)*16, sliceSize/32); // per-peer slice size
#pragma unroll 1
for (int slice=0; slice<SlicePerChunk; ++slice) {
int realSize = max(0, min(dataSize, peerElem-offset));
bool fenceNeeded = false;
if (tid < nworkers) {
if (Send) {
// Scatter pre-scales data of input buffer only in non-Direct case
constexpr int PreOpN = DirectSend ? 0 : 1;
constexpr int PreOpSrcs = DirectSend ? 0 : 1;
if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
#pragma unroll 1
// Loop over peers
for (int j=0; j<fan.nsend(); j++) {
int i = (j+shift)%fan.nsend();
int peerOffset = i*peerElem;
int pOffset = i*peerOffset;
// Skip the data I am responsible of reducing myself
if (skip >= 0 && i >= skip) peerOffset += peerElem;
const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
int realPeerSize = min(realSize, totalElem-peerOffset);
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
// Mark for threadfence at the end
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
fenceNeeded |= true;
}
}
} else if (Recv) {
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
int peerOffset = index*peerElem;
if (skip >= 0 && index >= skip) peerOffset += peerElem;
int pOffset = index*peerOffset;
if (skip >= 0 && index >= skip) pOffset += peerElem;
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+peerOffset, offset, realSize);
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
subBarrier();
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
@@ -441,21 +443,17 @@ private:
} else {
for (int j=0; j<fan.nrecv(); j++) {
int i = (j+shift)%fan.nrecv();
peerOffset = i*peerElem;
if (skip >= 0 && i >= skip) peerOffset += peerElem;
T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset;
int realPeerSize = min(realSize, totalElem-peerOffset);
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
pOffset = i*peerOffset;
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
}
}
}
barrier();
// If we indeed send something, threadfence
if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
__threadfence_system();
__syncwarp();
postPeer<Recv, Send>();
fenceNeeded = __any(fenceNeeded);
postPeer<Recv, Send>(fenceNeeded);
offset += realSize;
}
}
@@ -471,25 +469,33 @@ private:
}
if (flags & RoleWaitRecv) {
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
if ((index == 0) && (flags & RoleWaitRecv)) {
if (conn->flags & NCCL_NVLS_MIN_POLL) {
flags |= NvlsMinPolling;
ncclShmem.groups[group].nvlsRecv = 1;
} else {
ncclShmem.groups[group].nvlsRecv = 0;
}
}
connStepPtr = conn->tail;
connStepCache = atomicAdd((unsigned long long *)connStepPtr, 0);
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (Direct) {
// User buffers have been registered
if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
}
}
@@ -511,8 +517,9 @@ private:
}
if (flags & RoleWaitSend) {
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = atomicAdd((unsigned long long *)connStepPtr, 0);
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (flags & OffsFifoEnabled)
connOffsFifoPtr = conn->offsFifo;
@@ -523,20 +530,20 @@ private:
connSizesFifoPtr = conn->sizesFifo;
} else if (Direct) {
// User buffers have been registered
if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
}
}
@@ -549,7 +556,7 @@ private:
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
):
tid(tid),
tid(tid), tidInBlock(threadIdx.x),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
@@ -566,7 +573,7 @@ private:
this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync = 8;
static_assert(MaxSend < ThreadPerSync && MaxRecv < ThreadPerSync, "Not enough threads to cover all peers");
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
int g = tid / ThreadPerSync;
int ng = nthreads / ThreadPerSync;
@@ -726,6 +733,9 @@ private:
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp);
}
__device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, postOp);
}
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp);
}
@@ -756,25 +766,21 @@ private:
}
__device__ __forceinline__ void
scatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) {
ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false);
scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
directScatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) {
ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false);
directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
gather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp=false) {
ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, postOp);
gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
}
__device__ __forceinline__ void
directGather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void recvSend(int eltN) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, /*postOp=*/false);
directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
// MSCCL primitives
File diff suppressed because it is too large Load Diff
+42
View File
@@ -92,3 +92,45 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROT
runRing<T, RedOp, ProtoLL128>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsScatter = 128 + WARP_SIZE;
const int nThreadsReduce = 384;
const int tidEndScatter = nThreadsScatter;
const int tidEndReduce = tidEndScatter + nThreadsReduce;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndScatter) {
// Scatter
int group = (0*Proto::MaxGroupWidth) | (0<<16);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
int group = (3*Proto::MaxGroupWidth) | (1<<16);
// Reduce through MC
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem);
}
}
}
};
+5 -2
View File
@@ -17,7 +17,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
@@ -43,7 +43,8 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
#endif
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count);
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (isNpKitThread) {
@@ -59,6 +60,8 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
#endif
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
}
} else {
int chunkSize = args->chunkSize/sizeof(T);
+2
View File
@@ -74,6 +74,8 @@ void ncclDebugInit() {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
+73 -79
View File
@@ -29,58 +29,55 @@ struct ncclKernelMatch {
typedef void(*ncclKern_t)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
// Must be consistent with the ncclFuncSet enum
static ncclKernelMatch const ncclKerns[4] = {
static ncclKernelMatch const ncclKerns[2] = {
{(void *)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true},
{(void *)NCCL_KERN_NAME_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t), true},
{(void *)NCCL_KERN_NAME_LL128(SendRecv, RING, SIMPLE, Sum, int8_t), true},
{(void *)NCCL_KERN_NAME_LL128_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t), true},
};
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
// Determine the maximum kernel stack size of all CUDA kernels
size_t ncclKernMaxLocalSize() {
ncclResult_t res = ncclSuccess;
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
cudaFuncAttributes attr = {0};
size_t max = 0;
for (int i = 0; i < numNcclKerns; i++) {
if (ncclKerns[i].kernelFn != nullptr) {
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, reinterpret_cast<const void*>(ncclKerns[i].kernelFn)), res, error);
if (attr.localSizeBytes > max) max = attr.localSizeBytes;
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
// Returns maximum kernel stack size of all CUDA kernels
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
ncclResult_t result = ncclSuccess;
if (maxStackSize) *maxStackSize = 0;
int carveout = ncclParamL1SharedMemoryCarveout();
// Keep track if we already visited a function pointer.
void* lru[2] = {nullptr, nullptr};
for (int i=0; i < KernelCount; i++) {
void* fn = ncclKerns[i].kernelFn;
if (fn == lru[0] || fn == lru[1]) goto next_kernel;
lru[1] = lru[0];
lru[0] = fn;
if (maxStackSize) {
cudaFuncAttributes attr = {0};
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
ignore0:;
}
if (carveout) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
result, ignore1);
ignore1:;
}
if (ncclShmemDynamicSize(cudaArch) != 0) {
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
result, next_kernel);
}
next_kernel:;
}
error:
return (res != ncclSuccess) ? 0 : max;
return result;
}
// Determine kernel stack size from index
size_t ncclKernLocalSize(int i) {
ncclResult_t res = ncclSuccess;
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
cudaFuncAttributes attr = {0};
if (i < numNcclKerns)
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, reinterpret_cast<const void*>(ncclKerns[i].kernelFn)), res, error);
error:
return (res != ncclSuccess) ? 0 : attr.localSizeBytes;
}
// Set shared memory carveout for the nccl kernels
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) {
ncclResult_t res = ncclSuccess;
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
for (int i = 0; i < numNcclKerns; i++) {
CUDACHECKGOTO(cudaFuncSetAttribute((const void *)ncclKerns[i].kernelFn, cudaFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error);
}
error:
return res;
}
/*****************************************************************************/
/* Launch system : synchronization and CUDA kernel launch */
/*****************************************************************************/
@@ -211,10 +208,9 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP
static ncclResult_t addCollToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex,
struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp,
int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
) {
struct ncclKernelPlan::Channel *chans = plan->channels;
int nCollChannels = comm->nChannels;
// Choose the `nBid` least loaded channels to do the work. This ensures
// all bids go to different channels in case they need to synchronize.
@@ -231,9 +227,7 @@ static ncclResult_t addCollToPlan(
}
}
// Sort in the rest of the channels. If a channel has less work than the max
// member of least[], replace that member and compute the new max. The optimal
// algorithm uses a max-heap, but for our small sizes I suspect the better
// asymptotic complexity would be swamped by the increased instruction complexity.
// member of least[], replace that member and compute the new max.
for (int c=nBid; c < nCollChannels; c++) {
if (chans[c].collBytes < maxBytesInLeast) {
least[maxIndexInLeast] = c;
@@ -507,8 +501,9 @@ static ncclResult_t scheduleCollTasksToPlan(
info.sliceSteps = head->sliceSteps;
NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks));
if (nAggOps > 1) {
int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]);
info.nChannels = std::max(1, std::min(info.nChannels, comm->nChannels));
info.nChannels = std::max(1, std::min(info.nChannels, maxChannels));
info.algorithm = aggInfo.algorithm;
info.protocol = aggInfo.protocol;
info.nThreads = aggInfo.nThreads;
@@ -531,8 +526,9 @@ static ncclResult_t scheduleCollTasksToPlan(
NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
}
int maxChannels = info.algorithm == NCCL_ALGO_NVLS ? comm->nvlsChannels : comm->nChannels;
NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
tasks->nTasksColl -= 1;
tasks->collBytesTotal -= info.nBytes;
ncclIntruQueueDequeue(&tasks->collQueue);
@@ -830,7 +826,7 @@ static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_;
ncclResult_t result = hostStreamPlanTask(plan->comm, plan);
if (result != ncclSuccess) {
WARN("hostStreamPlanCallback() failed : %s\n", ncclGetErrorString(result));
WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
}
}
@@ -943,7 +939,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
CUDACHECK(hipStreamWaitEvent(tasks->streams->stream, comm->doneEvent, 0));
}
if (persistent || comm->persistentRefs != 0) {
if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) {
// We have to launch host tasks to push proxy args. We are careful to only
// do this if necessary since host tasks impose a high performance cost in CUDA.
bool acquired = false;
@@ -984,12 +980,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
return ncclSuccess;
}
#if CUDART_VERSION >= 11080
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
#define NCCL_CGA_CLUSTER_SIZE_SM90 4
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", -2);
#endif
#if CUDART_VERSION >= 12000
// NCCL uses the "Remote" Mem Sync domain by default
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
@@ -1001,6 +991,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
cudaStream_t launchStream = tasks->streams->stream;
dim3 grid = {(unsigned)plan->channelCount, 1, 1};
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
size_t smem = ncclShmemDynamicSize(comm->cudaArch);
void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};
if (tasks->numStreams == 1) {
CUDACHECK(hipExtLaunchKernel(plan->kernelFn, grid, block, args, 0, tasks->streams->stream, NULL, comm->doneEvent, 0));
@@ -1013,19 +1004,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
if (driverVersion >= 11080) {
int compCap = comm->compCap;
unsigned int clusterSize = (compCap == 90) ? NCCL_CGA_CLUSTER_SIZE_SM90 : 0;
if (ncclParamCGAClusterSize() != -2) {
clusterSize = ncclParamCGAClusterSize();
if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
static bool warned = false;
if (warned == false) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
warned = true;
}
clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
}
unsigned int clusterSize = (compCap == 90) ? comm->cgaClusterSize : 0;
cudaLaunchConfig_t launchConfig = {0};
cudaLaunchAttribute launchAttrs[3];
@@ -1057,6 +1036,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
#endif
launchConfig.gridDim = grid;
launchConfig.blockDim = block;
launchConfig.dynamicSmemBytes = smem;
launchConfig.attrs = launchAttrs;
launchConfig.numAttrs = attrs;
launchConfig.stream = launchStream;
@@ -1066,12 +1046,12 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
}
#endif
// Standard kernel launch
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream));
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream));
return ncclSuccess;
}
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
if (comm->persistentRefs == 0) { // implies !plan->persistent
if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) {
// If this isn't being captured and there aren't any CUDA graphs alive
// then we don't need to do our proxyOp pushing on the host stream.
NCCLCHECK(hostStreamPlanTask(comm, plan));
@@ -1146,6 +1126,8 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
int nAlgos = NCCL_NUM_ALGORITHMS;
for (int a=0; a<nAlgos; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time;
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
@@ -1178,6 +1160,9 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
}
ncSwitch /= 2;
}
} else if (info->algorithm == NCCL_ALGO_NVLS) {
// NVLS should not need more than 16 channels to get peak BW.
nc = comm->nvlsChannels;
} else {
// Ring/Tree channel tuning
while (info->nBytes < nc*nt*threadThreshold) {
@@ -1198,6 +1183,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_NVLS) nt = NCCL_MAX_NTHREADS;
}
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
#endif
@@ -1245,6 +1231,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
info->pattern = ncclPatternRing; break;
case ncclFuncAllReduce:
info->pattern =
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain :
info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown :
@@ -1264,6 +1251,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternCollnetChain:
case ncclPatternNvls:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternCollnetDirect:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break;
@@ -1292,13 +1280,6 @@ comp_next:
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
if (info->comm->topo->pivotA2ANumBiRings == 3 ) {
if (ncclTypeSize(info->datatype)*info->count > 131072) {
work->pad_0 = 1;
} else {
work->pad_0 = 2;
}
}
work->sendbuff = info->sendbuff;
work->recvbuff = info->recvbuff;
work->root = info->root;
@@ -1359,6 +1340,14 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_NVLS) {
if (chunkSize > 131072) chunkSize = 131072;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
@@ -1671,6 +1660,11 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
WARN("ncclRedOpDestroy : operator is garbage.");
return ncclInvalidArgument;
}
if (comm == NULL) {
WARN("ncclRedOpDestroy : invalid communicator passed.");
return ncclInvalidArgument;
}
int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) {
WARN("ncclRedOpDestroy : operator unknown to this communicator.");
-292
View File
@@ -5,25 +5,6 @@
* See LICENSE.txt for license information
************************************************************************/
/*
* Code for binary tree based on the same function available in Open MPI
* File: ompi/mca/coll/base/coll_base_topo.c
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
*/
#include "comm.h"
#include "graph.h"
#include "trees.h"
@@ -95,279 +76,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
return ncclSuccess;
}
static int calculate_level (int rank)
{
int level, num;
if( rank < 0 ) return -1;
for( level = 0, num = 0; num <= rank; level++ ) {
num += 1<<level;
}
return level-1;
}
static int calculate_num_nodes_up_to_level (int level)
{
return ((1<<level) - 1);
}
ncclResult_t ncclBinaryTreePostset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph) {
int nChannels = comm->nChannels;
int localRanks = 0;
for (int i=0; i<comm->topo->nodes[GPU].count; i++) {
localRanks += comm->topo->nodes[GPU].nodes[i].gpu.nRanksPerGpu;
}
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
// Only the first rank on a GPU can be a treeRoot
int treeRoot = comm->topo->nodes[GPU].nodes[c%comm->topo->nodes[GPU].count].gpu.rank[0];
channel->binTree.up = -1;
channel->binTree.down[0] = -1;
channel->binTree.down[1] = -1;
channel->binTree.down[2] = -1;
/*
* Shift all ranks by root, so that the algorithm can be
* designed as if root would be always 0
* shiftedrank should be used in calculating distances
* and position in tree
*/
int shiftedrank = comm->rank - treeRoot;
if (shiftedrank < 0 ) {
shiftedrank += localRanks;
}
/* calculate my level */
int level = calculate_level (shiftedrank);
int delta = 1<<level;
/* find my children */
for (int i = 0; i < 2; i++) {
int schild = shiftedrank + delta * (i+1);
if (schild < localRanks) {
channel->binTree.down[i] = (schild+treeRoot)%localRanks;
}
}
/* find my parent */
int slimit = calculate_num_nodes_up_to_level (level);
int sparent = shiftedrank;
if (sparent < 2) {
sparent = 0;
}
else {
while (sparent >= slimit) {
sparent -= delta/2;
}
}
if (comm->rank != treeRoot) {
channel->binTree.up = (sparent+treeRoot)%localRanks;
}
}
return ncclSuccess;
}
#define NUM_HAYABUSA_TREES 2
static bool hayabusa_tree_matrix_is_init=false;
static int hayabusa_tree_matrix[NUM_HAYABUSA_TREES][16][4];
static void hayabusa_tree_matrix_init()
{
if (hayabusa_tree_matrix_is_init)
return;
// index = rank of proc, child0, child1, child2, parent
// channel 0: root is 15
hayabusa_tree_matrix[0][0][0] = 1;
hayabusa_tree_matrix[0][0][1] = -1;
hayabusa_tree_matrix[0][0][2] = -1;
hayabusa_tree_matrix[0][0][3] = 4;
hayabusa_tree_matrix[0][1][0] = -1;
hayabusa_tree_matrix[0][1][1] = -1;
hayabusa_tree_matrix[0][1][2] = -1;
hayabusa_tree_matrix[0][1][3] = 0;
hayabusa_tree_matrix[0][2][0] = 3;
hayabusa_tree_matrix[0][2][1] = -1;
hayabusa_tree_matrix[0][2][2] = -1;
hayabusa_tree_matrix[0][2][3] = 6;
hayabusa_tree_matrix[0][3][0] = -1;
hayabusa_tree_matrix[0][3][1] = -1;
hayabusa_tree_matrix[0][3][2] = -1;
hayabusa_tree_matrix[0][3][3] = 2;
hayabusa_tree_matrix[0][4][0] = 0;
hayabusa_tree_matrix[0][4][1] = -1;
hayabusa_tree_matrix[0][4][2] = -1;
hayabusa_tree_matrix[0][4][3] = 5;
hayabusa_tree_matrix[0][5][0] = 4;
hayabusa_tree_matrix[0][5][1] = -1;
hayabusa_tree_matrix[0][5][2] = -1;
hayabusa_tree_matrix[0][5][3] = 14;
hayabusa_tree_matrix[0][6][0] = 2;
hayabusa_tree_matrix[0][6][1] = 7;
hayabusa_tree_matrix[0][6][2] = -1;
hayabusa_tree_matrix[0][6][3] = 14;
hayabusa_tree_matrix[0][7][0] = -1;
hayabusa_tree_matrix[0][7][1] = -1;
hayabusa_tree_matrix[0][7][2] = -1;
hayabusa_tree_matrix[0][7][3] = 6;
hayabusa_tree_matrix[0][8][0] = -1;
hayabusa_tree_matrix[0][8][1] = -1;
hayabusa_tree_matrix[0][8][2] = -1;
hayabusa_tree_matrix[0][8][3] = 9;
hayabusa_tree_matrix[0][9][0] = 13;
hayabusa_tree_matrix[0][9][1] = 8;
hayabusa_tree_matrix[0][9][2] = -1;
hayabusa_tree_matrix[0][9][3] = 11;
hayabusa_tree_matrix[0][10][0] = -1;
hayabusa_tree_matrix[0][10][1] = -1;
hayabusa_tree_matrix[0][10][2] = -1;
hayabusa_tree_matrix[0][10][3] = 11;
hayabusa_tree_matrix[0][11][0] = 9;
hayabusa_tree_matrix[0][11][1] = 10;
hayabusa_tree_matrix[0][11][2] = -1;
hayabusa_tree_matrix[0][11][3] = 15;
hayabusa_tree_matrix[0][12][0] = -1;
hayabusa_tree_matrix[0][12][1] = -1;
hayabusa_tree_matrix[0][12][2] = -1;
hayabusa_tree_matrix[0][12][3] = 13;
hayabusa_tree_matrix[0][13][0] = 12;
hayabusa_tree_matrix[0][13][1] = -1;
hayabusa_tree_matrix[0][13][2] = -1;
hayabusa_tree_matrix[0][13][3] = 9;
hayabusa_tree_matrix[0][14][0] = 5;
hayabusa_tree_matrix[0][14][1] = 6;
hayabusa_tree_matrix[0][14][2] = -1;
hayabusa_tree_matrix[0][14][3] = 15;
hayabusa_tree_matrix[0][15][0] = 14;
hayabusa_tree_matrix[0][15][1] = 11;
hayabusa_tree_matrix[0][15][2] = -1;
hayabusa_tree_matrix[0][15][3] = -1;
//Channel 1: root is 6
hayabusa_tree_matrix[1][0][0] = -1;
hayabusa_tree_matrix[1][0][1] = -1;
hayabusa_tree_matrix[1][0][2] = -1;
hayabusa_tree_matrix[1][0][3] = 1;
hayabusa_tree_matrix[1][1][0] = 5;
hayabusa_tree_matrix[1][1][1] = 0;
hayabusa_tree_matrix[1][1][2] = -1;
hayabusa_tree_matrix[1][1][3] = 3;
hayabusa_tree_matrix[1][2][0] = -1;
hayabusa_tree_matrix[1][2][1] = -1;
hayabusa_tree_matrix[1][2][2] = -1;
hayabusa_tree_matrix[1][2][3] = 3;
hayabusa_tree_matrix[1][3][0] = 1;
hayabusa_tree_matrix[1][3][1] = 2;
hayabusa_tree_matrix[1][3][2] = -1;
hayabusa_tree_matrix[1][3][3] = 7;
hayabusa_tree_matrix[1][4][0] = -1;
hayabusa_tree_matrix[1][4][1] = -1;
hayabusa_tree_matrix[1][4][2] = -1;
hayabusa_tree_matrix[1][4][3] = 5;
hayabusa_tree_matrix[1][5][0] = 4;
hayabusa_tree_matrix[1][5][1] = -1;
hayabusa_tree_matrix[1][5][2] = -1;
hayabusa_tree_matrix[1][5][3] = 1;
hayabusa_tree_matrix[1][6][0] = 7;
hayabusa_tree_matrix[1][6][1] = 13;
hayabusa_tree_matrix[1][6][2] = -1;
hayabusa_tree_matrix[1][6][3] = -1;
hayabusa_tree_matrix[1][7][0] = 3;
hayabusa_tree_matrix[1][7][1] = 15;
hayabusa_tree_matrix[1][7][2] = -1;
hayabusa_tree_matrix[1][7][3] = 6;
hayabusa_tree_matrix[1][8][0] = 9;
hayabusa_tree_matrix[1][8][1] = -1;
hayabusa_tree_matrix[1][8][2] = -1;
hayabusa_tree_matrix[1][8][3] = 12;
hayabusa_tree_matrix[1][9][0] = -1;
hayabusa_tree_matrix[1][9][1] = -1;
hayabusa_tree_matrix[1][9][2] = -1;
hayabusa_tree_matrix[1][9][3] = 8;
hayabusa_tree_matrix[1][10][0] = -1;
hayabusa_tree_matrix[1][10][1] = -1;
hayabusa_tree_matrix[1][10][2] = -1;
hayabusa_tree_matrix[1][10][3] = 11;
hayabusa_tree_matrix[1][11][0] = 10;
hayabusa_tree_matrix[1][11][1] = -1;
hayabusa_tree_matrix[1][11][2] = -1;
hayabusa_tree_matrix[1][11][3] = 15;
hayabusa_tree_matrix[1][12][0] = 8;
hayabusa_tree_matrix[1][12][1] = -1;
hayabusa_tree_matrix[1][12][2] = -1;
hayabusa_tree_matrix[1][12][3] = 13;
hayabusa_tree_matrix[1][13][0] = 12;
hayabusa_tree_matrix[1][13][1] = -1;
hayabusa_tree_matrix[1][13][2] = -1;
hayabusa_tree_matrix[1][13][3] = 6;
hayabusa_tree_matrix[1][14][0] = -1;
hayabusa_tree_matrix[1][14][1] = -1;
hayabusa_tree_matrix[1][14][2] = -1;
hayabusa_tree_matrix[1][14][3] = 15;
hayabusa_tree_matrix[1][15][0] = 11;
hayabusa_tree_matrix[1][15][1] = 14;
hayabusa_tree_matrix[1][15][2] = -1;
hayabusa_tree_matrix[1][15][3] = 7;
hayabusa_tree_matrix_is_init = true;
}
static void set_channel_info(int c, int rank, struct ncclChannel *channel)
{
channel->binTree.down[0] = hayabusa_tree_matrix[c%NUM_HAYABUSA_TREES][rank][0];
channel->binTree.down[1] = hayabusa_tree_matrix[c%NUM_HAYABUSA_TREES][rank][1];
channel->binTree.down[2] = hayabusa_tree_matrix[c%NUM_HAYABUSA_TREES][rank][2];
channel->binTree.up = hayabusa_tree_matrix[c%NUM_HAYABUSA_TREES][rank][3];
}
ncclResult_t ncclBinaryTreeHayabusaPostset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph) {
int nChannels = comm->nChannels;
hayabusa_tree_matrix_init();
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
set_channel_info(c, comm->localRank, channel);
}
return ncclSuccess;
}
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph) {
int nChannels = comm->nChannels;
+2 -2
View File
@@ -486,7 +486,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
type = node->type;
}
if (type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev);
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return ncclInternalError;
}
*intermediateRank = node->gpu.rank[0];
@@ -802,6 +802,7 @@ static int nextPow2(int v) {
}
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
int minChannels = comm->p2pnChannels;
@@ -842,7 +843,6 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
comm->p2pChannels[c] = mirror;
}
INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
return ncclSuccess;
}
+1 -2
View File
@@ -896,7 +896,6 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
}
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
// SPLIT_TREE works better on older archs.
int ccMin;
NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
@@ -1177,6 +1176,7 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int *inter) {
int interGpus[MAX_XGMI_INTER_GPUS+1];
int ngpus = system->nodes[GPU].count;
*isXGMI = false;
// check for direct XGMI connection
for (int i=0; i<ngpus; i++) {
if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
@@ -1231,6 +1231,5 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in
}
}
}
*isXGMI = false;
return ncclSuccess;
}
+1 -1
View File
@@ -917,6 +917,6 @@ ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int*
}
}
}
WARN("Could not find local GPU with rank %d\n", rank);
WARN("Could not find local GPU with rank %d", rank);
return ncclInternalError;
}
+1 -1
View File
@@ -225,6 +225,6 @@ static float ncclTopoXGMISpeed(int gcn) {
}
#define ncclGetKernelIndex(p_comm) \
(((p_comm)->topo->ll128Enabled ? 1 : 0)*2 + ((p_comm)->collTraceThread ? 1 : 0))
((p_comm)->collTraceThread ? 1 : 0)
#endif
+49 -38
View File
@@ -54,7 +54,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 } };
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 } };
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
@@ -71,18 +71,18 @@ struct tuningModel {
static struct tuningModel tuning_model_0 {
.hwLat = {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 } },
{ /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -101,18 +101,18 @@ static struct tuningModel tuning_model_0 {
static struct tuningModel tuning_model_1 {
.hwLat =
{ /* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } },
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 } },
{ /* Tree (LL/LL128/Simple)*/ { 33.0, 33.0, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 5.1, 5.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } },
},
.bwRatio =
{ /* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -131,18 +131,18 @@ static struct tuningModel tuning_model_1 {
static struct tuningModel tuning_model_2 {
.hwLat = {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } },
{ /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 }, /* NVLS */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 } },
{ /* Tree (LL/LL128/Simple)*/ { 27.9, 27.9, 15.8 }, /* Ring (LL/LL128/Simple)*/ { 12.1, 12.1, 68.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 15.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 15.8 }, /* NVLS */ { 0, 0, 0 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.07, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -161,18 +161,18 @@ static struct tuningModel tuning_model_2 {
static struct tuningModel tuning_model_3 {
.hwLat = {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 } },
{ /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -193,16 +193,16 @@ static struct tuningModel tuning_model_4 {
/* NVLINK */
{ /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.8, 1.4, 2.5 }, /* CollNetChain (Simple)*/ { 0.8, 1.4, 2.5 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } },
{ /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 } },
/* NET */
{ /* Tree (LL/LL128/Simple)*/ { 32.2, 34.4, 47.6 }, /* Ring (LL/LL128/Simple)*/ { 35.4, 87.8, 209.2 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 47.6 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 47.6 } },
},
.bwRatio = {
/* 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.61 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
/* more than 2 nodes */
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } },
{ /* Tree (LL/LL128/Simple)*/ { 0.16, 1.09, 1.08 }, /* Ring (LL/LL128/Simple)*/ { 0.15, 0.41, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 } },
},
.treeCorrectionFactor = {
@@ -232,7 +232,7 @@ static struct tuningModel rcclTuningModel[] = {
#define HOPPER_COMPCAP_IDX 2
// LL128 max BW per channel
static const double ll128MaxBwPerCh = 20.0;
static const double ll128MaxBwPerCh[3] = { 20.0, 20.0, 36.7 };
static const double llMaxBws[3][3] = {
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
@@ -242,7 +242,7 @@ static const double llMaxBws[3][3] = {
static const double perChMaxTreeBws[3][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 33.0},
};
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
@@ -261,7 +261,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
@@ -272,7 +273,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess;
int compCapIndex = (minCompCap == 80 && maxCompCap == 80) ? AMPERE_COMPCAP_IDX : ((minCompCap == 90 && maxCompCap == 90) ? HOPPER_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
int index2 = nNodes <= 2 ? nNodes-1 : 2;
@@ -284,7 +285,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
//if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph };
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph, collNetGraph, ringGraph/* we only need the NVSwitch speed for NVLS*/ };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
@@ -301,6 +302,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if ((coll != ncclFuncAllReduce) && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_NVLS && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
@@ -314,11 +316,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
#else
if (compCapIndex == AMPERE_COMPCAP_IDX) busBw = std::min(busBw, 235.0f);
if (compCapIndex == HOPPER_COMPCAP_IDX) busBw = std::min(busBw, 370.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh*graphs[a]->nChannels);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh[compCapIndex]*graphs[a]->nChannels);
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -331,7 +334,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_COLLNET_CHAIN && p == NCCL_PROTO_SIMPLE) busBw *= .75;
// Convert bus BW to algorithm BW
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
float ratio;
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS) ratio = .75;
else ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
@@ -366,7 +372,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO");
if (protoStr) {
@@ -378,6 +384,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
}
// Disable NVLink SHARP if not supported
if (comm->nvlsSupport == 0 /* || comm->localRanks <= 2*/) algoEnable[NCCL_ALGO_NVLS] = 0;
// Disable CollNet if it is not supported
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
@@ -404,7 +414,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
#else
// Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
pEnable = 1;
pEnable &= (graphs[a]->typeInter <= PATH_PXB);
pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
pEnable &= (graphs[a]->typeIntra <= PATH_NVL);
pEnable &= (minCompCap == maxCompCap);
switch (minCompCap) {
@@ -416,8 +426,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
#endif
}
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
// Only disable algo for Allreduce since others only have one
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
// Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE.
if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
if (comm->rank == 0) {
@@ -461,9 +472,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
char* str = getenv("NCCL_THREAD_THRESHOLDS");
if (str) {
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }, { -2, -2, -2 }};
ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int a=0; a<2; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
}
+1 -2
View File
@@ -328,7 +328,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
ret = ncclSystemError;
}
job->state = ncclGroupJobJoined;
if (job->result != ncclSuccess) {
if (job->result != ncclSuccess && ret == ncclSuccess) {
ret = job->result;
errorJobAbortFlag = true;
}
@@ -339,7 +339,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
if (*groupAbortFlag == true || errorJobAbortFlag == true) {
*job->abortFlag = 1;
ret = ncclInternalError;
}
job = job->next;
+1
View File
@@ -25,6 +25,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
#endif
+13 -13
View File
@@ -35,12 +35,6 @@ struct ncclDevRedOpFull {
#define NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type) \
ncclKernelDebug_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type) \
ncclKernelLL128_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type) \
ncclKernelLL128Debug_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
@@ -49,16 +43,12 @@ struct ncclDevRedOpFull {
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
#else
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
#endif
#define SINGLE_ARG(...) __VA_ARGS__
@@ -76,7 +66,8 @@ struct ncclDevRedOpFull {
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef)
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef)
#if defined(RCCL_BFLOAT16)
#define DECL2(func, devredop, undefForFloat) \
@@ -147,4 +138,13 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
#define ALLTOALL_PIVOT_SLICESTEPS 2
#define ALLTOALL_PIVOT_CHUNKSTEPS 4
// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
// macro will be used in preprocessor conditionals where enums have no meaning.
#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
(((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
(type==7 && red==0) || \
(type==8 && red==0))
#endif
+13
View File
@@ -110,6 +110,7 @@ struct ncclChannel {
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
uint64_t p2pOpCount;
@@ -183,10 +184,12 @@ struct ncclComm {
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int compCap; // compute capability of the GPU
int minCompCap; // min compute capability in the communicator
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int WarpSize;
int virtualId;
int cudaArch; // matches __CUDA_ARCH__ of device
int node;
int nNodes;
@@ -209,6 +212,7 @@ struct ncclComm {
// Channels for collectives
int nChannels;
int nvlsChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
@@ -270,6 +274,10 @@ struct ncclComm {
int collNetSupport;
int intraHighestTransportType;
// NVLink SHARP (NVLS) support
int nvlsSupport;
void* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions
// Internal streams
@@ -313,6 +321,11 @@ struct ncclComm {
// communicator mode
int blocking;
// CGA cluster size
int cgaClusterSize;
int minCTAs, maxCTAs;
// network interface name
char *netName;
// initState is to more conveniently reclaim resources when errors happen.
ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called
+23 -1
View File
@@ -73,10 +73,32 @@ DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
@@ -88,6 +110,7 @@ DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
ncclResult_t ncclCudaLibraryInit(void);
extern int ncclCudaDriverVersionCache;
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
inline ncclResult_t ncclCudaDriverVersion(int* driver) {
int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
@@ -98,5 +121,4 @@ inline ncclResult_t ncclCudaDriverVersion(int* driver) {
*driver = version;
return ncclSuccess;
}
#endif
+78 -5
View File
@@ -21,11 +21,12 @@
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -88,6 +89,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_DIRECT_NIC 0x04
#define NCCL_IPC_WRITE 0x08
#define NCCL_IPC_READ 0x10
#define NCCL_NVLS_MIN_POLL 0x20
struct ncclConnInfo {
// Regular comm mechanism
@@ -95,7 +97,7 @@ struct ncclConnInfo {
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
int direct; // Direct communication
int flags; // Direct communication / other flags
int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
@@ -154,14 +156,23 @@ struct ncclTree {
struct ncclDirect {
int depth;
int out;
int nHeads;
int headRank;
int shift;
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[NCCL_MAX_DIRECT_ARITY];
int down[NCCL_MAX_DIRECT_ARITY];
};
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_NVLS_ARITY 8
struct ncclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int up[NCCL_MAX_NVLS_ARITY];
int down;
};
#define NCCL_MAX_CONNS 3
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
@@ -361,6 +372,7 @@ struct alignas(16) ncclDevChannel {
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};
@@ -399,4 +411,65 @@ struct alignas(16) ncclDevCommAndChannels {
struct ncclDevChannel channels[MAXCHANNELS];
};
#ifdef __CUDA_ARCH__
#define NCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define NCCL_CUDA_ARCH 0
#endif
template<typename T>
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template<typename T>
__host__ __device__ constexpr T max_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack);
}
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) {
return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */0,
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
) + 15) & -16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
}
#endif
+1 -3
View File
@@ -15,9 +15,7 @@
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
size_t ncclKernMaxLocalSize();
size_t ncclKernLocalSize(int i);
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
-3
View File
@@ -119,9 +119,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
ncclResult_t ncclBinaryTreePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
ncclResult_t ncclBinaryTreeHayabusaPostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
+1
View File
@@ -25,6 +25,7 @@ typedef enum : uint8_t {
ncclPatternTreeUpDown,
ncclPatternCollnetChain,
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
+37
View File
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#ifndef NCCL_IPCSOCKET_H
#define NCCL_IPCSOCKET_H
#include "nccl.h"
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <memory.h>
#include <sys/un.h>
#include <inttypes.h>
#define NCCL_IPC_SOCKNAME_LEN 64
struct ncclIpcSocket {
int fd;
char socketName[NCCL_IPC_SOCKNAME_LEN];
volatile uint32_t* abortFlag;
};
ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
#endif /* NCCL_IPCSOCKET_H */
+1 -1
View File
@@ -20,7 +20,7 @@
#define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+8 -8
View File
@@ -7,12 +7,12 @@
#ifndef NCCL_NVTX_H_
#define NCCL_NVTX_H_
#include "nvtx3.hpp"
#include "nvtx3/nvtx3.hpp"
#if __cpp_constexpr >= 201304L && !defined(NVTX3_RELAXED_CONSTEXPR)
#define NVTX3_RELAXED_CONSTEXPR constexpr
#if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14)
#define NVTX3_CONSTEXPR_IF_CPP14 constexpr
#else
#define NVTX3_RELAXED_CONSTEXPR
#define NVTX3_CONSTEXPR_IF_CPP14
#endif
// Define all NCCL-provided static schema IDs here (avoid duplicates).
@@ -37,7 +37,7 @@ struct nccl_domain{static constexpr char const* name{"NCCL"};};
class payload_schema {
public:
NVTX3_RELAXED_CONSTEXPR explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{
schema_attr.name = schemaName;
schema_attr.entries = entries;
@@ -74,11 +74,11 @@ class payload_schema {
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
static const payload_schema schema{S, std::extent<decltype(S)>::value, \
NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
static ::nvtx3::v1::registered_string<nccl_domain> const nvtx3_func_name__{__func__}; \
static ::nvtx3::v1::registered_string_in<nccl_domain> const nvtx3_func_name__{__func__}; \
nvtxPayloadData_t nvtx3_bpl__[] = { \
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
::nvtx3::v1::event_attributes nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
::nvtx3::v1::domain_thread_range<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
::nvtx3::v1::scoped_range_in<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
extern void initNvtxRegisteredEnums();
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+2 -2
View File
@@ -1,12 +1,12 @@
/*
* Copyright 2021 NVIDIA Corporation. All rights reserved.
* Copyright 2021-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvtx3/nvToolsExt.h"
#include "nvToolsExt.h"
#ifndef NVTOOLSEXT_PAYLOAD_H
#define NVTOOLSEXT_PAYLOAD_H
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
File diff suppressed because it is too large Load Diff
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
@@ -35,10 +35,11 @@ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
{
intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
nvtxExtModuleSegment_t segment = {
0, // unused (only one segment)
NVTX3EXT_CBID_PAYLOAD_FN_NUM,
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1
fnSlots
};
nvtxExtModuleInfo_t module = {
+40 -2
View File
@@ -12,6 +12,7 @@
#include "devcomm.h"
#include "info.h"
#include "socket.h"
#include "ipcsocket.h"
#include <pthread.h>
#include "shm.h"
@@ -171,6 +172,31 @@ struct ncclProxyProgressState {
int nextOps;
};
// Expected proxy response fifo
struct ncclExpectedProxyResponse {
void* opId;
int respSize;
bool done;
void* respBuff;
struct ncclExpectedProxyResponse* next;
};
struct ncclProxyAsyncOp {
int type;
struct ncclProxyConnection* connection;
int reqSize, respSize;
char *reqBuff, *respBuff;
void* opId;
ncclProxyAsyncOp* next;
};
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
ncclProxyAsyncOp* asyncOps;
int asyncOpCounter;
};
struct ncclProxyState {
// Service thread
pthread_t thread;
@@ -186,6 +212,9 @@ struct ncclProxyState {
// Progress thread
struct ncclProxyProgressState progressState;
// Queue of expected responses from the proxy
struct ncclExpectedProxyResponse* expectedResponses;
};
enum proxyConnectState {
@@ -230,10 +259,19 @@ enum ncclProxyMsgType {
ncclProxyMsgStart = 5,
ncclProxyMsgClose = 6,
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8
ncclProxyMsgStop = 8,
ncclProxyMsgConvertFd = 9 // cuMem API support
};
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
+2
View File
@@ -70,4 +70,6 @@ DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
ncclResult_t rocmLibraryInit(void);
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
#endif
+1 -1
View File
@@ -92,6 +92,6 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
ncclResult_t ncclSocketClose(struct ncclSocket* sock);
#endif
+4 -1
View File
@@ -65,7 +65,7 @@ struct ncclTransportComm {
};
struct ncclTransport {
const char name[4];
const char name[8];
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
@@ -74,6 +74,9 @@ struct ncclTransport {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
+182 -83
View File
@@ -53,7 +53,7 @@
#endif
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAllPivot" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
const char* ncclDevRedOpStr[ncclNumDevRedOps] = { "Sum", "Prod", "Max", "Min", "PreMulSum", "SumPostDiv" };
const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "_u64", "_f16", "_f32", "_f64", "_b16"};
@@ -61,7 +61,7 @@ const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", 0);
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT);
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
@@ -89,12 +89,8 @@ ncclResult_t initGdrCopy() {
return ncclSuccess;
}
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static size_t maxLocalSizeBytes = 0;
static ncclResult_t ncclInit() {
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess;
@@ -102,9 +98,6 @@ static ncclResult_t ncclInit() {
if (!initialized) {
initEnv();
initGdrCopy();
maxLocalSizeBytes = ncclKernMaxLocalSize();
int carveout = ncclParamL1SharedMemoryCarveout();
if (carveout) ncclKernSetSharedMemoryCarveout(carveout);
// Always initialize bootstrap network
NCCLCHECK(bootstrapNetInit());
NCCLCHECK(ncclNetPluginInit());
@@ -380,6 +373,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
NCCLCHECK(ncclStrongStreamDestruct(&comm->deviceStream));
}
if (comm->nvlsSupport) NCCLCHECK(ncclNvlsFree(comm));
struct ncclDestructor* dtor = comm->destructorHead;
while (dtor != nullptr) {
NCCLCHECK(dtor->fn(dtor));
@@ -391,6 +386,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
ncclMemoryStackDestruct(&comm->memPermanent);
ncclCudaHostFree((void *)comm->abortFlag);
free(comm->netName);
commPoison(comm); // poison comm before free to avoid comm reuse.
free(comm);
@@ -418,8 +414,8 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
int flag = 0;
CUdevice dev;
int cudaDriverVersion;
CUCHECK(cuDriverGetVersion(&cudaDriverVersion));
if (cudaDriverVersion < 11070) return ncclInternalError;
CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion));
if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
// Query device to see if DMA-BUF support is available
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev));
@@ -442,7 +438,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
if (ret != ncclSuccess) {
/* if ret is not ncclInProgress, we just keep it. */
WARN("Attempt to use communicator before the previous operation returned ncclSuccess\n");
WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
if (ret == ncclInProgress) ret = ncclInvalidArgument;
goto exit;
}
@@ -596,6 +592,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain;
tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect;
tmpCommAndChans.channels[c].binTree = comm->channels[c].binTree;
tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls;
tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c];
if (comm->channels[c].ring.userRanks != nullptr) {
@@ -759,8 +756,8 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collN
struct ncclChannel* channel = comm->channels + c;
for (int h = 0; h < nHeads; h++) {
const int head = heads[h];
collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
}
// Verify CollNet setup across ranks after trying the first channel
if (c == 0) {
@@ -1218,39 +1215,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc), ret, fail);
if (comm->topo->pivotA2ANumBiRings == 3) {
NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
if (comm->virtualId == -1) {
NCCLCHECK(ncclBinaryTreeHayabusaPostset(comm, &treeGraph));
} else {
NCCLCHECK(ncclBinaryTreePostset(comm, &treeGraph));
}
}
if (comm->topo->pivotA2ANumBiRings == 3) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
// AllGather3 - end
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
char line[1024], binline[1024];
char line[1024];
line[0]='\0';
binline[0]='\0';
for (int c=0; c<comm->nChannels; c++) {
struct ncclTree* tree = &comm->channels[c].tree;
struct ncclTree* binTree = &comm->channels[c].binTree;
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d",
c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up);
if (comm->topo->pivotA2ANumBiRings == 3)
snprintf(binline+strlen(binline), 1023-strlen(binline), " [%d] %d/%d/%d->%d->%d",
c, binTree->down[0], binTree->down[1], binTree->down[2], rank, binTree->up);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d comm %p nRanks %02d busId %lx", c, comm->channels[c].ring.prev,
comm->rank, comm->channels[c].ring.next, comm, comm->nRanks, comm->busId);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId);
if (comm->topo->pivotA2ANumBiRings == 3) {
binline[1023] = '\0';
INFO(NCCL_INIT, "BinTrees%s comm %p nRanks %02d busId %lx", binline, comm, comm->nRanks, comm->busId);
}
NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
@@ -1280,11 +1261,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail);
// RCCL: need to connect binTree as well
if (comm->topo->pivotA2ANumBiRings == 3) {
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->binTree.down, 1, &channel->binTree.up, 0), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->binTree.up, NCCL_MAX_TREE_ARITY, channel->binTree.down, 0), ret, fail);
}
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail);
INFO(NCCL_INIT, "Connected all trees");
@@ -1292,6 +1268,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Check if we can setup CollNet
if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
//NCCLCHECKGOTO(ncclNvlsSetup(comm), ret, fail);
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
// Compute time models for algorithm and protocol combinations
@@ -1299,7 +1277,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int myCompCap = comm->peerInfo[rank].cudaCompCap;
int minCompCap = myCompCap, maxCompCap = myCompCap;
for (int i = 0; i < nranks; i++) {
minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
comm->minCompCap = minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
}
NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
@@ -1308,6 +1286,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Compute nChannels per peer for p2p
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
do { // Setup p2p structures in comm->tasks
struct ncclTasks* tasks = &comm->tasks;
int nRanks = comm->nRanks;
@@ -1374,12 +1354,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
}
}
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
}
// Connect to local net proxy
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
// Then to remote ones when using PXN
if (ncclPxnDisable(comm) == 0) {
@@ -1387,7 +1368,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
for (int r=0; r<nranks; r++) {
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
}
}
@@ -1441,6 +1422,11 @@ RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 512);
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);
#endif
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT);
// Match config max/minCTAs
NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
struct ncclCommInitRankAsyncJob {
struct ncclAsyncJob base;
@@ -1465,10 +1451,17 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
int myrank = job->myrank;
int cudaDev = job->cudaDev;
int virtualId = job->virtualId;
int archMajor, archMinor;
size_t maxLocalSizeBytes = 0;
ncclResult_t res = ncclSuccess;
int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev));
comm->cudaArch = 100*archMajor + 10*archMinor;
NCCLCHECK(ncclInitKernelsForDevice(comm->cudaArch, &maxLocalSizeBytes));
// Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
#ifdef USE_INDIRECT_FUNCTION_CALL
@@ -1487,7 +1480,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)",
*newcomm, nranks, (unsigned long long)hashUniqueId(commId), myrank, (*newcomm)->cudaDev);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx localSize %ld used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, ncclKernLocalSize(ncclGetKernelIndex(*newcomm)), allocTracker[(*newcomm)->cudaDev].totalAllocSize);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx localSize %zi used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, maxLocalSizeBytes, allocTracker[(*newcomm)->cudaDev].totalAllocSize);
exit:
return res;
fail:
@@ -1495,18 +1488,143 @@ fail:
goto exit;
}
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
ncclResult_t ret = ncclSuccess;
/* first set configuration */
if (config) {
comm->blocking = config->blocking;
} else {
/* default setting of communicator */
comm->blocking = 1;
#define NCCL_CONFIG_DEFAULT(config, field, undef, defvalue, fieldStr, format) \
if (config->field == undef) { \
config->field = defvalue; \
} else { \
INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \
}
static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
ncclResult_t ret = ncclSuccess;
/* config must not be NULL in this function */
int blockingEnv;
int cgaClusterSizeEnv;
int minCTAsEnv;
int maxCTAsEnv;
const char *envNetName, *tmpNetName;
ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr;
size_t realSize;
internalConfigPtr = &internalConfig;
if (config) {
memcpy((void*)&realSize, (void*)config, sizeof(size_t));
realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
memcpy((void*)internalConfigPtr, (void*)config, realSize);
if (internalConfigPtr->magic != 0xcafebeef) {
WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
ret = ncclInvalidArgument;
goto fail;
}
/* check version. */
if (internalConfigPtr->version < NCCL_VERSION(2, 14, 0)) {
internalConfigPtr->blocking = defaultConfig.blocking;
}
if (internalConfigPtr->version < NCCL_VERSION(2, 17, 0)) {
internalConfigPtr->cgaClusterSize = defaultConfig.cgaClusterSize;
internalConfigPtr->minCTAs = defaultConfig.minCTAs;
internalConfigPtr->maxCTAs = defaultConfig.maxCTAs;
internalConfigPtr->netName = defaultConfig.netName;
}
}
/* check input config attributes, -1 means user-undefined and we should use default value from NCCL. */
if (internalConfigPtr->blocking != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
ret = ncclInvalidArgument;
goto fail;
}
if (internalConfigPtr->cgaClusterSize != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->cgaClusterSize < 0) {
WARN("Invalid config cgaClusterSize attribute value %d", internalConfigPtr->cgaClusterSize);
ret = ncclInvalidArgument;
goto fail;
}
if ((internalConfigPtr->minCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->minCTAs <= 0) ||
(internalConfigPtr->maxCTAs != NCCL_CONFIG_UNDEF_INT &&
internalConfigPtr->maxCTAs <= 0) ||
(internalConfigPtr->minCTAs > internalConfigPtr->maxCTAs)) {
WARN("Invalid config min/max channels attribute value %d/%d", internalConfigPtr->minCTAs, internalConfigPtr->maxCTAs);
ret = ncclInvalidArgument;
goto fail;
}
/* default config value can be tuned on different platform. */
NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
tmpNetName = internalConfigPtr->netName;
/* assign config to communicator */
comm->blocking = internalConfigPtr->blocking;
comm->cgaClusterSize = internalConfigPtr->cgaClusterSize;
comm->minCTAs = internalConfigPtr->minCTAs;
comm->maxCTAs = internalConfigPtr->maxCTAs;
/* override configuration from env variable. */
blockingEnv = ncclParamCommBlocking();
if (blockingEnv == 0 || blockingEnv == 1)
comm->blocking = blockingEnv;
cgaClusterSizeEnv = ncclParamCGAClusterSize();
if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
comm->cgaClusterSize = cgaClusterSizeEnv;
} else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
comm->cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
minCTAsEnv = ncclParamMinCTAs();
if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->minCTAs = minCTAsEnv;
}
maxCTAsEnv = ncclParamMaxCTAs();
if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
comm->maxCTAs = maxCTAsEnv;
}
/* cap channels if needed */
if (comm->minCTAs > MAXCHANNELS) {
WARN("minCTAs %d is larger than #channels upper limit %d", comm->minCTAs, MAXCHANNELS);
comm->minCTAs = MAXCHANNELS;
}
if (comm->maxCTAs > MAXCHANNELS) {
WARN("maxCTAs %d is larger than #channels upper limit %d", comm->maxCTAs, MAXCHANNELS);
comm->maxCTAs = MAXCHANNELS;
}
if (comm->minCTAs > comm->maxCTAs) {
WARN("minCTAs %d is larger than maxCTAs %d", comm->minCTAs, comm->maxCTAs);
ret = ncclInvalidArgument;
goto fail;
}
envNetName = getenv("NCCL_NET");
if (envNetName)
tmpNetName = envNetName;
if (tmpNetName != NULL) {
int netNameLen = strlen(tmpNetName) + 1;
comm->netName = (char*)malloc(netNameLen);
memcpy(comm->netName, tmpNetName, netNameLen);
} else {
comm->netName = NULL;
}
exit:
return ret;
fail:
goto exit;
}
static void ncclCommInitRankUndo(struct ncclAsyncJob* job_) {
@@ -1533,6 +1651,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
CUDACHECKGOTO(cudaFree(NULL), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail);
NCCLCHECKGOTO(PtrCheck(config, "CommInitRank", "config"), res, fail);
if (nranks < 1 || myrank < 0 || myrank >= nranks) {
WARN("Invalid rank requested : %d/%d", myrank, nranks);
res = ncclInvalidArgument;
@@ -1584,12 +1703,13 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
if (ncclParamDmaBufEnable()) rocmLibraryInit();
int cudaDev;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL, -1));
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config, -1));
return ncclSuccess;
}
@@ -1599,12 +1719,13 @@ ncclResult_t ncclCommInitRankMulti(ncclComm_t* newcomm, int nranks, ncclUniqueId
if (ncclParamDmaBufEnable()) rocmLibraryInit();
int cudaDev;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
CUDACHECK(hipGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, NULL, virtualId));
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config, virtualId));
return ncclSuccess;
}
@@ -1614,6 +1735,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
ncclResult_t ret = ncclSuccess;
int totalnDev;
int *gpuFlags = NULL;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
@@ -1657,7 +1779,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
for (int i=0; i<ndev; i++) {
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, NULL, -1);
ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config, -1);
}
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
@@ -1682,39 +1804,16 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr;
size_t realSize;
int blockingEnv;
ncclConfig_t *internalConfigPtr = NULL;
NCCLCHECK(ncclGroupStartInternal());
internalConfigPtr = &internalConfig;
if (config) {
memcpy((void*)&realSize, (void*)config, sizeof(size_t));
realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize;
memcpy((void*)internalConfigPtr, (void*)config, realSize);
if (internalConfigPtr->magic != 0xcafebeef) {
WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER");
ret = ncclInvalidArgument;
goto exit;
}
}
/* check input config attributes */
if (internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) {
WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking);
ret = ncclInvalidArgument;
goto exit;
}
/* overwrite configuration from env variable. */
blockingEnv = ncclParamCommBlocking();
if (blockingEnv != 0 && blockingEnv != 1) {
WARN("Invalid NCCL_COMM_BLOCKING value %d", blockingEnv);
}
if (blockingEnv == 1) internalConfigPtr->blocking = blockingEnv;
if (ncclParamDmaBufEnable()) (void) rocmLibraryInit();
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, exit);
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr, -1), ret, fail);
exit:
+54 -1
View File
@@ -23,11 +23,33 @@ DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
/* proxy.cc */
DECLARE_CUDA_PFN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
/* cuMem API support */
DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN(cuMemCreate, 10020);
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN(cuMemMap, 10020);
DECLARE_CUDA_PFN(cuMemRelease, 10020);
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
/* transport/collNet.cc/net.cc*/
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
@@ -39,6 +61,7 @@ DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
static void *cudaLib;
int ncclCudaDriverVersionCache = -1;
bool ncclCudaLaunchBlocking = false;
#if CUDART_VERSION >= 11030
/*
@@ -62,9 +85,33 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemGetAddressRange, 3020, 1);
LOAD_SYM(cuCtxCreate, 3020, 1);
LOAD_SYM(cuCtxDestroy, 4000, 1);
LOAD_SYM(cuCtxGetCurrent, 4000, 1);
LOAD_SYM(cuCtxSetCurrent, 4000, 1);
LOAD_SYM(cuCtxGetDevice, 2000, 1);
/* cuMem API support */
#if CUDA_VERSION >= 11030
LOAD_SYM(cuMemAddressReserve, 10020, 1);
LOAD_SYM(cuMemAddressFree, 10020, 1);
LOAD_SYM(cuMemCreate, 10020, 1);
LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1);
LOAD_SYM(cuMemExportToShareableHandle, 10020, 1);
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
LOAD_SYM(cuMemMap, 10020, 1);
LOAD_SYM(cuMemRelease, 10020, 1);
LOAD_SYM(cuMemSetAccess, 10020, 1);
LOAD_SYM(cuMemUnmap, 10020, 1);
#endif
#if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
LOAD_SYM(cuMulticastAddDevice, 12010, 1);
LOAD_SYM(cuMulticastBindMem, 12010, 1);
LOAD_SYM(cuMulticastBindAddr, 12010, 1);
LOAD_SYM(cuMulticastCreate, 12010, 1);
LOAD_SYM(cuMulticastGetGranularity, 12010, 1);
LOAD_SYM(cuMulticastUnbind, 12010, 1);
#endif
return ncclSuccess;
}
@@ -74,6 +121,11 @@ static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static ncclResult_t initResult;
static void initOnceFunc() {
do {
char* val = getenv("CUDA_LAUNCH_BLOCKING");
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
} while (0);
CUresult res;
/*
* Load CUDA driver library
@@ -85,9 +137,10 @@ static void initOnceFunc() {
else
snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");
(void) dlerror(); // Clear any previous errors
cudaLib = dlopen(path, RTLD_LAZY);
if (cudaLib == NULL) {
WARN("Failed to find CUDA library (NCCL_CUDA_PATH='%s') : %s", ncclCudaPath ? ncclCudaPath : "", dlerror());
WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
goto error;
}
+200
View File
@@ -0,0 +1,200 @@
/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#include "ipcsocket.h"
#include "utils.h"
#include <stdlib.h>
#include <string.h>
#include <errno.h>
// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET
#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx"
/*
* Create a Unix Domain Socket
*/
ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
int fd = -1;
struct sockaddr_un cliaddr;
char temp[NCCL_IPC_SOCKNAME_LEN] = "";
if (handle == NULL) {
return ncclInternalError;
}
handle->fd = -1;
handle->socketName[0] = '\0';
if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
WARN("UDS: Socket creation error : %d", errno);
return ncclSystemError;
}
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
// Create unique name for the socket.
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
return ncclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
unlink(temp);
#endif
TRACE(NCCL_INIT, "UDS: Creating socket %s", temp);
strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
WARN("UDS: Binding to socket %s failed : %d", temp, errno);
close(fd);
return ncclSystemError;
}
handle->fd = fd;
strcpy(handle->socketName, temp);
handle->abortFlag = abortFlag;
// Mark socket as non-blocking
if (handle->abortFlag) {
int flags;
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
return ncclSuccess;
}
ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
if (handle == NULL) {
return ncclInternalError;
}
if (handle->fd <= 0) {
return ncclSuccess;
}
#ifndef USE_ABSTRACT_SOCKET
if (handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
#endif
close(handle->fd);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
// Union to guarantee alignment requirements for control array
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
char dummy_buffer[1];
int ret;
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
msg.msg_iov = iov;
msg.msg_iovlen = 1;
while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Receiving data over socket failed : %d", errno);
return ncclSystemError;
}
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
}
if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
return ncclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return ncclSystemError;
}
TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg;
struct iovec iov[1];
char temp[NCCL_IPC_SOCKNAME_LEN];
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
struct sockaddr_un cliaddr;
// Construct client address to send this shareable handle to
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot connect to provided name for socket. Name too large");
return ncclInternalError;
}
(void) strncpy(cliaddr.sun_path, temp, len);
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void *)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
iov[0].iov_base = (void *)"";
iov[0].iov_len = 1;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
ssize_t sendResult;
while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
return ncclSystemError;
}
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
}
return ncclSuccess;
}
+6
View File
@@ -24,8 +24,14 @@ static enum { hsaUninitialized, hsaInitializing, hsaInitialized, hsaError } hsaS
static void *hsaLib;
static uint16_t version_major, version_minor;
bool ncclCudaLaunchBlocking = false;
ncclResult_t rocmLibraryInit(void) {
do {
char* val = getenv("CUDA_LAUNCH_BLOCKING");
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
} while (0);
hsa_status_t res;
if (hsaState == hsaInitialized)
+28 -4
View File
@@ -51,7 +51,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
int closed;
NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
if (closed) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
@@ -827,23 +827,47 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
}
// Receive or detect connection closed
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
int offset = 0;
if (sock == NULL) {
WARN("ncclSocketTryRecv: pass NULL socket");
return ncclInvalidArgument;
}
*closed = 0;
while (offset < size) {
// Block until connection closes or nbytes received
if (blocking) {
while (offset < size) {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
}
} else {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
// If any bytes were received, block waiting for the rest
if (offset > 0) {
while (offset < size) {
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
}
// No bytes were received, return ncclInProgress
} else {
return ncclInProgress;
}
}
return ncclSuccess;
}
ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
if (sock != NULL) {
if (sock->fd >= 0) close(sock->fd);
if (sock->fd >= 0) {
/* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
* by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
* the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
* connection close here. */
shutdown(sock->fd, SHUT_RDWR);
close(sock->fd);
}
sock->state = ncclSocketStateClosed;
sock->fd = -1;
}
+15 -2
View File
@@ -30,7 +30,9 @@ extern "C" {
#endif
/*! @brief Opaque handle to communicator */
#include <limits.h>
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
@@ -46,15 +48,22 @@ typedef enum { ncclSuccess = 0,
ncclInProgress = 7,
ncclNumResults = 8 } ncclResult_t;
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
/* Communicator configuration. Users can assign value to attributes to specify the
* behavior of a communicator. */
typedef struct ncclConfig_v21400 {
typedef struct ncclConfig_v21700 {
/* attributes that users should never touch. */
size_t size;
unsigned int magic;
unsigned int version;
/* attributes that users are able to customize. */
int blocking;
int cgaClusterSize;
int minCTAs;
int maxCTAs;
const char *netName;
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
@@ -63,7 +72,11 @@ typedef struct ncclConfig_v21400 {
sizeof(ncclConfig_t), /* size */ \
0xcafebeef, /* magic */ \
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
1 /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR /* netName */ \
}
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+28 -16
View File
@@ -183,14 +183,8 @@ ncclResult_t ncclNetPluginInit() {
}
void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
if (netPluginLib == nullptr) {
// dlopen does not guarantee to set errno, but dlerror only gives us a
// string, so checking errno doesn't hurt to try to provide a better
// error message
if (errno == ENOENT) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
} else {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
}
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror());
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation");
return ncclSuccess;
}
@@ -271,9 +265,10 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
ncclResult_t ncclNetInit(struct ncclComm* comm) {
// Initialize main communication network
char* netName = getenv("NCCL_NET");
char* netName;
bool ok = false;
netName = comm->netName;
for (int i=0; i<3; i++) {
if (ncclNets[i] == nullptr) continue;
enum ncclNetState state;
@@ -335,9 +330,26 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
bool connected;
connected = false;
while (!connected) {
// If we're aborting now, skip to cleanup
if (*comm->abortFlag) {
goto cleanup2;
}
if (sComm == NULL)
NCCLCHECKGOTO(ncclNetConnect(comm, dev, &handle, &sComm), ret, cleanup2);
if (rComm == NULL)
NCCLCHECKGOTO(ncclNetAccept(comm, lComm, &rComm), ret, cleanup2);
connected = (rComm != NULL) && (sComm != NULL);
}
CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
@@ -346,11 +358,11 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
}
ncclDebugNoWarn = 0;
CUDACHECK(cudaFree(gpuPtr));
cleanup4:
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
cleanup3:
NCCLCHECK(ncclNetCloseSend(comm, sComm));
cleanup2:
if (rComm != NULL)
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
if (sComm != NULL)
NCCLCHECK(ncclNetCloseSend(comm, sComm));
NCCLCHECK(ncclNetCloseListen(comm, lComm));
cleanup1:
break;
+375 -60
View File
@@ -16,6 +16,7 @@
#include "timer.h"
#include <sys/syscall.h>
#include <assert.h>
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -37,6 +38,155 @@ struct ncclProxyPool {
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
};
static void expectedProxyResponseFree(struct ncclProxyState* state) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
while (elem) {
prev = elem;
elem = elem->next;
free(prev->respBuff);
free(prev);
}
}
static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
while (elem) {
if (elem->opId == opId) {
if (respSize != elem->respSize) {
WARN("Mismatched response size for opId=%p", opId);
return ncclInternalError;
}
if (elem->done) {
WARN("Storing response for already completed opId=%p", opId);
return ncclInternalError;
}
memcpy(elem->respBuff, respBuff, respSize);
elem->done = true;
return ncclSuccess;
}
elem = elem->next;
}
WARN("Proxy response for opId=%p doesn't match any expected response", opId);
return ncclInternalError;
}
static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize, void* respData, int respDataSize) {
struct ncclExpectedProxyResponse* ex;
NCCLCHECK(ncclCalloc(&ex, 1));
ex->opId = opId;
// Pre-alloc response buffer
ex->respBuff = malloc(respSize);
ex->respSize = respSize;
ex->done = false;
if (respData) {
memcpy(ex->respBuff, respData, respDataSize);
ex->done = true;
}
// Enqueue
struct ncclExpectedProxyResponse* list = state->expectedResponses;
if (list == NULL) {
state->expectedResponses = ex;
return ncclSuccess;
}
while (list->next) list = list->next;
list->next = ex;
return ncclSuccess;
}
static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, void* opId, void* respBuff, int* found) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
*found = 0;
while (elem) {
if ((elem->opId == opId) && elem->done) {
if (prev == NULL) {
state->expectedResponses = elem->next;
} else {
prev->next = elem->next;
}
memcpy(respBuff, elem->respBuff, elem->respSize);
free(elem->respBuff);
free(elem);
*found = 1;
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
return ncclSuccess;
}
static ncclResult_t expectedProxyResponseRemove(struct ncclProxyState* state, void* opId) {
struct ncclExpectedProxyResponse* elem = state->expectedResponses;
struct ncclExpectedProxyResponse* prev = NULL;
while (elem) {
if (elem->opId == opId) {
if (prev == NULL) {
state->expectedResponses = elem->next;
} else {
prev->next = elem->next;
}
free(elem->respBuff);
free(elem);
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
WARN("Couldn't find opId=%p", opId);
return ncclInternalError;
}
static ncclResult_t asyncProxyOpEnqueue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
ncclProxyAsyncOp* list = peer->asyncOps;
if (list == NULL) {
peer->asyncOps = op;
return ncclSuccess;
}
while (list->next) list = list->next;
list->next = op;
return ncclSuccess;
}
static ncclResult_t asyncProxyOpDequeue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) {
struct ncclProxyAsyncOp* elem = peer->asyncOps;
struct ncclProxyAsyncOp* prev = NULL;
while (elem) {
if (elem->opId == op->opId) {
if (prev == NULL) {
peer->asyncOps = elem->next;
} else {
prev->next = elem->next;
}
if (elem->reqBuff) {
free(elem->reqBuff);
}
if (elem->respBuff) {
free(elem->respBuff);
}
free(elem);
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
if (op) {
WARN("Attempting to dequeue nonexistent async opId=%p", op->opId);
} else {
WARN("Attempting to dequeue null operation");
}
return ncclInternalError;
}
static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) {
struct ncclProxyArgs* elem;
if (state->pool == NULL) {
@@ -86,7 +236,7 @@ ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState*
pool = pool->next;
p++;
}
WARN("Could not find pool of op %p\n", op);
WARN("Could not find pool of op %p", op);
return ncclInternalError;
}
@@ -140,7 +290,7 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
nextOp->state |= OP_SEEN;
printf("\n");
if (nextOp->next) {
WARN("Inactive op has next set!\n");
WARN("Inactive op has next set!");
}
nextOp = nextOp->nextPeer;
}
@@ -337,7 +487,7 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
}
}
if (lastOp == -1) {
WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
return ncclInternalError;
}
// Cut chain at lastOp
@@ -775,19 +925,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
return ncclSuccess;
}
struct ncclProxyAsyncOp {
int type;
struct ncclProxyConnection* connection;
int reqSize, respSize;
char *reqBuff, *respBuff;
};
struct ncclProxyLocalPeer {
struct ncclSocket sock;
int localRank;
struct ncclProxyAsyncOp asyncOps;
};
#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7
#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2))
#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1)
@@ -795,7 +932,6 @@ struct ncclProxyConnectionPool {
struct ncclProxyConnection** pools;
int banks;
int offset;
struct ncclProxyAsyncOp* ops;
};
static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
@@ -893,26 +1029,137 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
return ncclSuccess;
}
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
struct ncclSocket* sock;
ncclResult_t ret = ncclSuccess;
void* respData = NULL;
int respDataSize = 0;
struct ncclComm* comm = proxyConn->comm;
struct ncclIpcSocket ipcSock = { 0 };
if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
sock = proxyConn->comm->proxyState.peerSocks + proxyConn->localRank;
if (*comm->abortFlag != 0) {
WARN("ncclProxyCallAsync() - Saw abortFlag while waiting for proxyThread response");
return ncclInternalError;
}
if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
sock = comm->proxyState.peerSocks + proxyConn->localRank;
if (sock == NULL) return ncclInternalError;
if (type == ncclProxyMsgConvertFd) {
// cuMem API support
// Create a UDS socket to receive the converted fd
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, (uint64_t)proxyConn->connection, comm->abortFlag));
}
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error);
if (type == ncclProxyMsgConvertFd) {
// cuMem API support
int recvFd = -1;
if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
// Receive converted fd over UDS
NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, &recvFd));
TRACE(NCCL_NET, "UDS: ConvertFd rank %d returned %p %d", proxyConn->localRank, &recvFd, recvFd);
assert(recvFd != -1);
respData = &recvFd;
respDataSize = sizeof(recvFd);
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
} else {
// Send opId to proxy
NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error);
}
// Add proxyOp to expected response queue
NCCLCHECK(expectedProxyResponseEnqueue(&comm->proxyState, opId, respSize, respData, respDataSize));
return ncclSuccess;
error:
WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
WARN("Proxy Call to rank %d failed (%s)", comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
return ret;
}
ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
struct ncclComm* comm = proxyConn->comm;
// Receive the connection pointer from the Proxy
if (*comm->abortFlag) {
WARN("Comm %p is in abort state", comm);
return ncclInternalError;
}
if (comm->proxyState.peerSocks == NULL) return ncclInternalError;
// Check response queue
int found = 0;
NCCLCHECK(expectedProxyResponseDequeue(&comm->proxyState, opId, respBuff, &found));
if (found == 0) {
// Attempt to read in a new response header from the proxy thread
struct ncclSocket* sock = comm->proxyState.peerSocks + proxyConn->localRank;
void* recvOpId;
int offset = 0;
if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
WARN("Socket recv failed while polling for opId=%p", opId);
return ncclInternalError;
}
if (offset == 0) {
return ncclInProgress;
// If we've returned a partial response, block to receive the rest of it
} else if (offset < sizeof(recvOpId)) {
while (offset < sizeof(recvOpId))
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
}
INFO(NCCL_PROXY, "ncclPollProxyResponse Recieved new opId=%p", recvOpId);
// Now do a blocking recv of the response size
int respSize = 0;
NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
// If there's a respSize to recv
if (respSize > 0) {
NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
}
if (recvOpId == opId) {
INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
NCCLCHECK(expectedProxyResponseRemove(&comm->proxyState, recvOpId));
return ncclSuccess;
} else {
INFO(NCCL_PROXY, "Queing opId=%p", recvOpId);
// Store the result and mark response as completed
NCCLCHECK(expectedProxyResponseStore(&comm->proxyState, recvOpId, respBuff, respSize));
return ncclInProgress;
}
} else {
INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId);
}
return ncclSuccess;
}
ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
// Alloc some memory to act as a handle
void* opId = malloc(1);
NCCLCHECK(ncclProxyCallAsync(proxyConn, type, reqBuff, reqSize, respSize, opId));
ncclResult_t res = ncclInProgress;
while (res == ncclInProgress) {
res = ncclPollProxyResponse(proxyConn, respBuff, opId);
}
free(opId);
return res;
}
static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
if (state->opsPool == NULL) {
@@ -1003,16 +1250,55 @@ static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct
if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError;
int nChannels;
NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
// Store opId for completion response
void* opId;
NCCLCHECK(ncclSocketRecv(sock, &opId, sizeof(opId)));
INFO(NCCL_PROXY, "proxyConnSharedInit received opId=%p", opId);
if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
__atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
// Send the opId for referencing async operation
INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(opId=%p)", opId);
NCCLCHECK(ncclSocketSend(connection->sock, &opId, sizeof(opId)));
// Send the response size
INFO(NCCL_PROXY, "proxyConnSharedInit::ncclSocketSend(op.respSize=%d)", respSize);
NCCLCHECK(ncclSocketSend(connection->sock, &respSize, sizeof(respSize)));
return ncclSuccess;
}
static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) {
// cuMem API support
static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, struct ncclComm* comm) {
struct ncclSocket* sock = &peer->sock;
uint64_t connection;
NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(uint64_t)));
int reqSize, respSize;
NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
if (reqSize != sizeof(int) || respSize != sizeof(int)) return ncclInternalError;
int fd;
struct ncclIpcSocket ipcSock = { 0 };
NCCLCHECK(ncclSocketRecv(sock, &fd, sizeof(int)));
INFO(NCCL_NET, "UDS: proxyConvertFd received fd %d peer %d connection %lx", fd, peer->localRank, connection);
// Send back the converted fd using UDS
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->localRank, connection^1, comm->abortFlag));
NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->localRank, connection));
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
return ncclSuccess;
}
static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount, struct ncclProxyLocalPeer* peer) {
int done = 1;
if (op->type == ncclProxyMsgSetup) {
INFO(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
} else if (op->type == ncclProxyMsgConnect) {
INFO(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
} else return ncclInternalError;
if (done) {
@@ -1020,31 +1306,38 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
__atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
else if (op->type == ncclProxyMsgConnect)
__atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
/* if setup or connect is done, we should not return any error at this point since
/* if setup or connect is done, we should not return any error at this point since
* ncclSocketSend might already send the respBuff to the requester. If we still choose
* to abort and close the connection, it can cause segfault if the requester is using
* the respBuff. */
if (op->respSize) ncclSocketSend(op->connection->sock, op->respBuff, op->respSize);
if (op->reqBuff) {
free(op->reqBuff);
op->reqBuff = NULL;
// Send the opId for referencing async operation
NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
// Send the response size
NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
if (op->respSize) {
// Send the response
NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
}
if (op->respBuff) {
free(op->respBuff);
op->respBuff = NULL;
}
op->type = 0;
asyncProxyOpDequeue(peer, op);
(*asyncOpCount)--;
return ncclSuccess;
} else if (*comm->abortFlag != 0) {
return ncclInternalError;
}
return ncclSuccess;
return ncclInProgress;
}
static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) {
struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps;
struct ncclProxyAsyncOp* asyncOp;
NCCLCHECK(ncclCalloc(&asyncOp, 1));
asyncOp->type = type;
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
@@ -1054,9 +1347,16 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
}
// Store opId for completion response
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
asyncProxyOpEnqueue(peer, asyncOp);
(*asyncOpCount)++;
NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount));
NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount, peer));
return ncclSuccess;
}
@@ -1086,7 +1386,7 @@ void* ncclProxyService(void* _args) {
pollfds[s].events = POLLHUP|POLLIN;
}
if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
WARN("[Proxy Service] Get listenSock fd fails\n");
WARN("[Proxy Service] Get listenSock fd fails");
return NULL;
};
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
@@ -1118,14 +1418,14 @@ void* ncclProxyService(void* _args) {
}
if (maxnpeers < s+1) maxnpeers = s+1;
if (ncclSocketInit(&peers[s].sock) != ncclSuccess) {
WARN("[Service thread] Initialize peers[%d].sock fails\n", s);
WARN("[Service thread] Initialize peers[%d].sock fails", s);
return NULL;
}
if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
WARN("[Service thread] Accept failed %s", strerror(errno));
} else {
if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
WARN("[Service thread] Get peers[%d].sock fd fails\n", s);
WARN("[Service thread] Get peers[%d].sock fd fails", s);
return NULL;
}
npeers++;
@@ -1135,25 +1435,37 @@ void* ncclProxyService(void* _args) {
for (int s=0; s<maxnpeers; s++) {
struct ncclProxyLocalPeer* peer = peers+s;
struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* op = &peer->asyncOps;
int closeConn = 0;
int type = 0;
ncclResult_t res = ncclSuccess;
if (pollfds[s].fd == -1) continue;
if (op->type != 0) {
res = proxyProgressAsync(op, comm, &asyncOpCount);
// Progress all ops for this ncclProxyLocalPeer
ncclProxyAsyncOp* op = peer->asyncOps;
while (op != nullptr) {
type = op->type;
if (res != ncclSuccess) closeConn = 1;
} else if (pollfds[s].revents & POLLIN) {
res = proxyProgressAsync(op, comm, &asyncOpCount, peer);
if (res == ncclSuccess || res == ncclInProgress) {
op = op->next;
} else {
// Res is a bad result
closeConn = 1;
WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", ncclProxyMsgTypeStr[type], res);
break;
}
}
// Check for additional ops coming in
if (pollfds[s].revents & POLLIN) {
int closed;
if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
WARN("[Service thread] Could not receive type from localRank %d", peer->localRank);
res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->localRank, res, closed);
closeConn = 1;
} else if (closed) {
INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
closeConn = 1;
} else {
} else if (res == ncclSuccess) { // We received something from the sock
if (type == ncclProxyMsgStop) {
stop = 1;
closeConn = 1;
@@ -1164,30 +1476,32 @@ void* ncclProxyService(void* _args) {
} else if (type == ncclProxyMsgSharedInit) {
res = proxyConnSharedInit(peers+s, &connectionPool, comm);
} else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
INFO(NCCL_PROXY, "proxyConnSetupConnect for peer->localRank %d,", peer->localRank);
res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
} else if (type == ncclProxyMsgConvertFd) {
res = proxyConvertFd(peers+s, comm); // cuMem API support
} else {
WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank);
WARN("[Service thread] Unknown command %d from localRank %d", type, peer->localRank);
closeConn = 1;
}
INFO(NCCL_PROXY, "Received and initiated operation=%s res=%d", ncclProxyMsgTypeStr[type], res);
}
} else if (pollfds[s].revents & POLLHUP) {
closeConn = 1;
}
if (res != ncclSuccess) {
}
if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res);
closeConn = 1;
}
if (closeConn) {
ncclSocketClose(sock);
if (op->reqBuff) {
free(op->reqBuff);
op->reqBuff = NULL;
if (op != nullptr) {
asyncProxyOpDequeue(peer, op);
asyncOpCount--;
}
if (op->respBuff) {
free(op->respBuff);
op->respBuff = NULL;
}
op->type = 0;
pollfds[s].fd = -1;
npeers--;
}
@@ -1255,6 +1569,7 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
free(state->peerSocks);
free(state->proxyOps);
free(state->sharedDevMems);
expectedProxyResponseFree(state);
}
return ncclSuccess;
}
+96 -47
View File
@@ -82,9 +82,12 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
ncclResult_t ret = ncclSuccess;
int highestType = TRANSPORT_P2P; // track highest transport type
struct ncclConnect data[2*MAXCHANNELS];
struct ncclConnect** data = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Store intermediate send/recvData structs for connect
struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
@@ -92,22 +95,28 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
struct ncclConnect* recvData = data;
// Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer
// This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers
// The first N entries contain recvData, connection information for recv connections
// The next M entries contain sendData, connection information for send connections
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
data[i] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS);
recvData[i] = data[i];
int sendChannels = 0, recvChannels = 0;
int type;
TIME_START(0);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[i]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type;
}
}
TIME_STOP(0);
TIME_START(1);
struct ncclConnect* sendData = recvData+recvChannels;
sendData[i] = recvData[i]+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[i]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type;
}
}
@@ -116,54 +125,94 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
TIME_START(2);
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
sendData = data;
recvData = data+sendChannels;
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
sendData[i] = data[i];
recvData[i] = data[i]+sendChannels;
}
} else {
if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail);
}
TIME_STOP(2);
TIME_START(3);
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
do {
struct ncclConnInfo connInfo;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&connInfo, &comm->channels[c].devPeers[sendPeer].send[connIndex], sizeof(struct ncclConnInfo), cudaMemcpyDeviceToHost, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(hipStreamSynchronize(comm->hostStream.cudaStream), ret, fail);
if (memcmp(&connInfo, &conn->conn, sizeof(struct ncclConnInfo)) == 0) break;
} while (true);
}
}
TIME_STOP(3);
TIME_START(4);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
do {
struct ncclConnInfo connInfo;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&connInfo, &comm->channels[c].devPeers[recvPeer].recv[connIndex], sizeof(struct ncclConnInfo), cudaMemcpyDeviceToHost, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(hipStreamSynchronize(comm->hostStream.cudaStream), ret, fail);
if (memcmp(&connInfo, &conn->conn, sizeof(struct ncclConnInfo)) == 0) break;
} while (true);
}
}
TIME_STOP(4);
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
}
// Loop until all channels with all ranks have been connected
bool allChannelsConnected;
allChannelsConnected = false;
while (!allChannelsConnected) {
allChannelsConnected = true;
for (int i=1; i<comm->nRanks; i++) {
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
int sendDataOffset = 0;
int recvDataOffset = 0;
for (int c=0; c<MAXCHANNELS; c++) {
TIME_START(3);
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
do {
struct ncclConnInfo connInfo;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&connInfo, &comm->channels[c].devPeers[sendPeer].send[connIndex], sizeof(struct ncclConnInfo), cudaMemcpyDeviceToHost, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(hipStreamSynchronize(comm->hostStream.cudaStream), ret, fail);
if (memcmp(&connInfo, &conn->conn, sizeof(struct ncclConnInfo)) == 0) break;
} while (true);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
}
}
TIME_STOP(3);
// Start with recv channels
TIME_START(4);
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
do {
struct ncclConnInfo connInfo;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&connInfo, &comm->channels[c].devPeers[recvPeer].recv[connIndex], sizeof(struct ncclConnInfo), cudaMemcpyDeviceToHost, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(hipStreamSynchronize(comm->hostStream.cudaStream), ret, fail);
if (memcmp(&connInfo, &conn->conn, sizeof(struct ncclConnInfo)) == 0) break;
} while (true);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
}
}
TIME_STOP(4);
}
}
}
// Clear all connect masks and free each connectInfo array
for (int i=1; i<comm->nRanks; i++) {
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
free(data[i]);
}
free(data);
free(sendData);
free(recvData);
if (highestTransportType != NULL) *highestTransportType = highestType;
TIME_PRINT("P2P Setup/Connect");
exit:
+11 -11
View File
@@ -155,13 +155,13 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
// Determine whether we need to flush the GDR buffer on recv or not
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
@@ -174,12 +174,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
int proxyRank;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
recv->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
@@ -224,7 +224,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError;
@@ -250,7 +250,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
// We're on the same process as the proxy. We can pass a pointer to a struct.
struct collNetConnectArgs args = { rank, nranks, connectInfos };
struct connectMap* map;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
// If collnet connect failed, propagate error to fallback on regular p2p
if (map == NULL) return ncclSystemError;
@@ -413,7 +413,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
}
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
@@ -429,7 +429,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
if (resources->collNetComm == NULL) {
*((struct connectMap**)respBuff) = NULL;
return ncclSuccess;
@@ -487,7 +487,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
}
static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
@@ -497,7 +497,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
// Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller.
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
if (resources->collNetComm == NULL) {
*((struct connectMap**)respBuff) = NULL;
return ncclSuccess;
@@ -556,7 +556,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
info->mhandles[p] = resources->mhandles[p];
if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; }
if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
*((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
}
+59 -21
View File
@@ -189,7 +189,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &req.netDev));
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) {
CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
send->conn.curr_hdp_reg = req.curr_hdp_reg;
@@ -199,7 +199,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
@@ -241,8 +241,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
req.rank = myInfo->rank;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
return ncclSuccess;
@@ -287,11 +286,28 @@ static ncclResult_t netDumpMap(struct connectMap* map) {
}
static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct connectMap* map;
NCCLCHECK(ncclCalloc(&map, 1));
send->transportResources = map;
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap)));
struct connectMap* map = (connectMap*) send->transportResources;
void* opId;
// map isn't allocated thus this op hasn't been submitted yet
if (!map) {
// Setup device pointers
NCCLCHECK(ncclCalloc(&map, 1));
send->transportResources = map;
opId = send;
INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
NCCLCHECK(ncclProxyCallAsync(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
} else {
opId = send;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&send->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId);
if (map->sameProcess) {
if (map->cudaDev != comm->cudaDev) {
@@ -338,10 +354,26 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
/* Connect to this peer */
static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct connectMap* map;
NCCLCHECK(ncclCalloc(&map, 1));
recv->transportResources = map;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap)));
struct connectMap* map = (connectMap*) recv->transportResources;
void* opId;
if (!map) {
NCCLCHECK(ncclCalloc(&map, 1));
recv->transportResources = map;
// Use recv connector as unique identifier
opId = recv;
INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
opId, &recv->proxyConn, connectInfo);
NCCLCHECK(ncclProxyCallAsync(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
} else {
opId = recv;
}
ncclResult_t ret;
NCCLCHECK(ret = ncclPollProxyResponse(&recv->proxyConn, map, opId));
if (ret == ncclInProgress) {
return ret;
}
INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId);
//NCCLCHECK(netDumpMap(map));
struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
@@ -514,12 +546,14 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
*done = 1;
return ncclSuccess;
}
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
ncclResult_t ret = ncclSuccess;
if (resources->shared) {
// Shared buffers
@@ -539,21 +573,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId));
if (comms->sendComm[resources->channelId] == NULL) ret = ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId);
resources->netSendComm = comms->sendComm[resources->channelId];
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
} else {
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
}
} else {
// Connect to remote peer
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
ret = ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm);
connection->proxyAppendPtr = &connection->proxyAppend;
}
NCCLCHECK(ret);
if (resources->netSendComm == NULL) {
*done = 0;
return ncclSuccess;
return ncclInProgress;
}
*done = 1;
@@ -666,6 +701,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (reqSize != sizeof(int)) return ncclInternalError;
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
resources->proxyRank = *(int*)reqBuff;
ncclResult_t ret = ncclSuccess;
// Finish connection establishment from remote peer
if (resources->shared) {
@@ -686,23 +722,25 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId));
if (comms->recvComm[resources->channelId] == NULL) ret = ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId);
resources->netRecvComm = comms->recvComm[resources->channelId];
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
} else {
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
}
} else {
// Connect to remote peer
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
ret = ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm);
connection->proxyAppendPtr = &connection->proxyAppend;
}
NCCLCHECK(ret);
if (resources->netRecvComm == NULL) {
*done = 0;
return ncclSuccess;
return ncclInProgress;
}
*done = 1;
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
// Create structures
+50 -41
View File
@@ -385,7 +385,9 @@ enum ncclIbCommState {
ncclIbCommStateAccept = 3,
ncclIbCommStateSend = 4,
ncclIbCommStateRecv = 5,
ncclIbCommStateConnected = 6,
ncclIbCommStateConnecting = 6,
ncclIbCommStateConnected = 7,
ncclIbCommStatePendingReady = 8,
};
struct ncclIbCommStage {
@@ -633,8 +635,10 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
int ready;
*sendComm = NULL;
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state == ncclIbCommStateConnecting) goto ib_connect;
if (stage->state == ncclIbCommStateConnected) goto ib_send_ready;
if (stage->state != ncclIbCommStateStart) {
WARN("Error: trying to connect already connected sendComm");
return ncclInternalError;
@@ -698,11 +702,37 @@ ib_connect_check:
ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
if (stage->offset != sizeof(qpInfo))
return ncclSuccess;
if (stage->offset != sizeof(qpInfo)) return ncclSuccess;
stage->state = ncclIbCommStateConnecting;
stage->offset = 0;
// Clear the staging buffer for re-use
memset(stage->buffer, 0, sizeof(qpInfo));
ib_connect:
struct ncclIbQpInfo remQpInfo;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, stage->buffer, sizeof(ncclIbQpInfo), &stage->offset));
if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
NCCLCHECK(ncclIbRtsQp(qp));
}
comm->ready = 1;
stage->state = ncclIbCommStateConnected;
stage->offset = 0;
ib_send_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, &comm->ready, sizeof(int), &stage->offset));
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer);
stage->state = ncclIbCommStateConnected;
stage->state = ncclIbCommStateStart;
*sendComm = comm;
return ncclSuccess;
}
@@ -719,8 +749,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
if (stage->state == ncclIbCommStateRecv) goto ib_recv;
if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready;
if (stage->state != ncclIbCommStateStart) {
WARN("Listencomm in unknown state %d\n", stage->state);
WARN("Listencomm in unknown state %d", stage->state);
return ncclInternalError;
}
@@ -738,10 +769,10 @@ ib_accept_check:
stage->state = ncclIbCommStateRecv;
stage->offset = 0;
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
ib_recv:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
if (stage->offset != sizeof(remQpInfo))
return ncclSuccess;
if (stage->offset != sizeof(remQpInfo)) return ncclSuccess;
/* copy back the received info */
memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
@@ -814,10 +845,18 @@ ib_recv:
if (stage->buffer) free(stage->buffer);
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));
ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
stage->offset = 0;
stage->state = ncclIbCommStatePendingReady;
ib_recv_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset));
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer);
*recvComm = rComm;
@@ -849,36 +888,6 @@ ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
return ncclSuccess;
}
ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
struct ncclIbQpInfo remQpInfo;
// Do not block on this receive, return if not ready.
int bytes = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
NCCLCHECK(ncclIbRtsQp(qp));
}
comm->ready = 1;
// Block until this is done. It *should* not block indefinitely.
NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int)));
return ncclSuccess;
}
ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
// Do not block on this receive, return if not ready.
int bytes = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
return ncclSuccess;
}
ncclResult_t ncclIbTest(void* request, int* done, int* size);
/* DMA-BUF support */
@@ -1054,7 +1063,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
if (comm->ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->ready == 0"); return ncclInternalError; }
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
struct ibv_mr* mr = (struct ibv_mr*)mhandle;
@@ -1187,7 +1196,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
if (comm->ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->ready == 0"); return ncclInternalError; }
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
+373
View File
@@ -0,0 +1,373 @@
/*************************************************************************
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
// Implementation of the NVLink SHARP (NVLS) transport
#include "comm.h"
#include "graph.h"
#include "utils.h"
#include "proxy.h"
#if CUDART_VERSION >= 12010
// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
#define USE_POSIX_FD 1
#if USE_POSIX_FD
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
#else
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
#endif
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
}
ncclResult_t nvlsSendFree(struct ncclConnector* send) {
return ncclSuccess;
}
ncclResult_t nvlsRecvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
struct ncclTransport nvlsTransport = {
"NVLS",
nvlsCanConnect,
{ NULL, NULL, nvlsSendFree, NULL, NULL, NULL, NULL, NULL },
{ NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
};
#define NVLS_HANDLE_SIZE 64
struct nvlsResources {
CUmulticastObjectProp properties;
CUmemAccessDesc accessDesc;
int dev;
size_t size;
size_t granularity;
CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
char* mcBuff; // Multicast NVLS buffer address
CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
char* ucBuff; // Unicast NVLS buffer address
};
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct nvlsResources* resources, int dev, int nranks, size_t size) {
CUmulticastObjectProp* prop = &resources->properties;
memset(prop, 0, sizeof(*prop));
prop->size = size;
prop->numDevices = nranks;
prop->handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
prop->flags = 0;
// Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved
CUCHECK(cuMulticastGetGranularity(&resources->granularity, prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
ALIGN_SIZE(size, resources->granularity);
prop->size = resources->size = size;
memset(&resources->accessDesc, 0, sizeof(resources->accessDesc));
resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
resources->accessDesc.location.id = dev;
resources->dev = dev;
return ncclSuccess;
}
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct nvlsResources* resources, int rank, unsigned int nranks, char* shareableHandle) {
size_t size = resources->size;
// Create a Multicast group
CUmulticastObjectProp* prop = &resources->properties;
INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank);
CUCHECK(cuMulticastCreate(&resources->mcHandle, prop));
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
// Get a handle to pass to other ranks
CUCHECK(cuMemExportToShareableHandle(shareableHandle, resources->mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
}
else {
memcpy(shareableHandle, &resources->mcHandle, sizeof(resources->mcHandle));
}
INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", resources->mcHandle, nranks, size, rank);
return ncclSuccess;
}
ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct nvlsResources* resources) {
INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev);
CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct nvlsResources* resources) {
int dev = resources->dev;
size_t size = resources->size;
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size));
return ncclSuccess;
}
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct nvlsResources* resources, int rank, char* shareableHandle) {
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// cuMem UDS support
int fd = *(int *)shareableHandle;
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
struct ncclProxyConnector proxyConn;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, rank, &proxyConn));
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
NCCLCHECK(ncclProxyCallBlocking(&proxyConn, ncclProxyMsgConvertFd, shareableHandle, sizeof(int), &fd, sizeof(int)));
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
} else {
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)shareableHandle, type));
} else {
memcpy(&resources->mcHandle, shareableHandle, sizeof(resources->mcHandle));
}
}
return ncclSuccess;
}
ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size = resources->size;
size_t granularity;
CUdeviceptr ptr = 0;
CUmemAllocationProp prop;
memset(&prop, 0, sizeof(prop));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = resources->dev;
prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
// Map a VA for UC memory
CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0));
// Alloc local physical mem for this NVLS group
CUCHECK(cuMemCreate(&resources->ucHandle, size, &prop, 0));
CUCHECK(cuMemMap(ptr, size, 0, resources->ucHandle, 0));
CUCHECK(cuMemSetAccess(ptr, size, &resources->accessDesc, 1));
CUDACHECK(cudaMemset((void*)ptr, 0, size));
resources->ucBuff = (char*)ptr;
INFO(NCCL_NVLS, "NVLS Mapped UC at %p size %zi", resources->ucBuff, size);
// Bind physical memory to the Multicast group
// NB: It will block until all ranks have been added to the Group
INFO(NCCL_NVLS, "NVLS Bind mem %p UC handle 0x%llx MC handle 0x%llx size %zi", (void*)ptr, resources->ucHandle, resources->mcHandle, size);
CUCHECK(cuMulticastBindMem(resources->mcHandle, 0/*mcOffset*/, resources->ucHandle, 0/*memOffset*/, size, 0/*flags*/));
return ncclSuccess;
}
ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size = resources->size;
CUdeviceptr ptr = 0;
// Create a VA for the NVLS
CUCHECK(cuMemAddressReserve(&ptr, size, resources->granularity, 0U, 0));
// Map the VA locally
CUCHECK(cuMemMap(ptr, size, 0, resources->mcHandle, 0));
resources->mcBuff = (char*)ptr;
INFO(NCCL_NVLS, "NVLS Mapped MC buffer at %p size %zi", resources->mcBuff, size);
// Having completed the BindMem we can now call SetAccess
// NB: It will block until all ranks have bound to the Group
CUCHECK(cuMemSetAccess((CUdeviceptr)resources->mcBuff, size, &resources->accessDesc, 1));
return ncclSuccess;
}
ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct nvlsResources* resources) {
size_t size;
CUdeviceptr ptr;
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)",
resources->ucHandle, resources->ucBuff, resources->mcHandle, resources->mcBuff);
// Release the UC memory and mapping
ptr = (CUdeviceptr)resources->ucBuff;
size = resources->size;
CUCHECK(cuMemUnmap(ptr, size));
CUCHECK(cuMemAddressFree(ptr, size));
CUCHECK(cuMemRelease(resources->ucHandle));
// Release the MC memory and mapping
ptr = (CUdeviceptr)resources->mcBuff;
size = resources->size;
CUCHECK(cuMemUnmap(ptr, size));
CUCHECK(cuMemAddressFree(ptr, size));
CUCHECK(cuMemRelease(resources->mcHandle));
return ncclSuccess;
}
#include "bootstrap.h"
#include "channel.h"
#define NVLS_MEM_ALIGN_SIZE (1 << 21)
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 1);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
if (!ncclParamNvlsEnable() || comm->localRanks <= 1 || comm->nNodes>1) return ncclSuccess;
CUdevice dev;
int driverVersion;
if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess;
CUCHECK(cuDeviceGet(&dev, comm->cudaDev));
CUDACHECK(cudaDriverGetVersion(&driverVersion));
comm->nvlsSupport = 0;
// NVLS Multicast support requires CUDA12.1 UMD + KMD
if (CUPFN(cuMulticastCreate) != NULL && driverVersion >= 12010) {
CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
}
INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev);
if (comm->nvlsSupport == 0) return ncclSuccess;
int nChannels = comm->nvlsChannels = std::max(comm->minCTAs, std::min(comm->maxCTAs, (int)ncclParamNvlsChannels()));
int rank = comm->localRank, nranks = comm->localRanks;
for (int c=0; c<nChannels; c++) {
NCCLCHECK(initChannel(comm, c));
}
ncclResult_t res = ncclSuccess;
struct nvlsResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
comm->nvlsResources = resources;
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
size_t memSize = NVLS_MEM_ALIGN_SIZE;
size_t nvlsPerRankSize = nChannels*2*(buffSize+memSize);
size_t nvlsTotalSize = nvlsPerRankSize*nranks;
INFO(NCCL_INIT|NCCL_NVLS, "NVLS comm %p rank %d nranks %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi",
comm, rank, nranks, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
char* nvlsShareableHandle = NULL;
NCCLCHECKGOTO(ncclCalloc(&nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nranks, nvlsTotalSize), res, cleanup);
if (rank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, rank, nranks, nvlsShareableHandle), res, cleanup);
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, rank, nranks, 0, nvlsShareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, 0, nvlsShareableHandle), res, cleanup);
}
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
// Local intra-node barrier to ensure everyone has bound their memory to the group
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nranks;
for (int i=0; i<NCCL_MAX_NVLS_ARITY; i++) channel->nvls.up[i] = -1;
channel->nvls.down = comm->nRanks+1+comm->localRank;
channel->nvls.out = -1; // Network not yet implemented.
channel->nvls.headRank = comm->localRank; // Network not yet implemented.
}
for (int r=0; r<nranks; r++) {
int nvlsPeer = comm->nRanks+1+r;
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.up[r] = nvlsPeer;
char* mem = NULL;
struct ncclChannelPeer* peer = channel->peers+nvlsPeer;
// Reduce UC -> MC
mem = resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->send[0].transportComm = &nvlsTransport.send;
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[0].conn.head = (uint64_t*)(mem+buffSize);
peer->send[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize);
peer->recv[1].transportComm = &nvlsTransport.recv;
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[1].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->recv[1].conn.flags |= NCCL_NVLS_MIN_POLL;
// Broadcast MC -> UC
mem = resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->recv[0].transportComm = &nvlsTransport.recv;
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->recv[0].conn.head = (uint64_t*)(mem+buffSize);
peer->recv[0].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
mem = resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize);
peer->send[1].transportComm = &nvlsTransport.send;
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
peer->send[1].conn.head = (uint64_t*)(mem+buffSize);
peer->send[1].conn.tail = (uint64_t*)(mem+buffSize+memSize/2);
peer->send[1].conn.flags |= NCCL_NVLS_MIN_POLL;
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[nvlsPeer].recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), res, cleanup);
/*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
nvlsPeer, c,
resources->mcBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->mcBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize),
resources->ucBuff + (r*2*nChannels+c)*(buffSize+memSize),
resources->ucBuff + ((r*2+1)*nChannels+c)*(buffSize+memSize));*/
}
}
free(nvlsShareableHandle);
return res;
cleanup:
comm->nvlsSupport = 0;
free(nvlsShareableHandle);
return res;
}
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
struct nvlsResources* resources = (struct nvlsResources*)comm->nvlsResources;
if (resources == NULL) return ncclSuccess;
NCCLCHECK(nvlsGroupUnbind(comm, resources));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources));
free(resources);
comm->nvlsResources = NULL;
return ncclSuccess;
}
#else
/*
* Pre CUDA 12.1 stubs
*/
ncclResult_t ncclNvlsSetup(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
return ncclSuccess;
}
#endif /* CUDA_VERSION >= 12010 */
+8 -8
View File
@@ -267,11 +267,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (ncclParamP2pDirectDisable() == 0) send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
} else {
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
}
@@ -284,11 +284,11 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
if (useMemcpy) {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
} else {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
}
@@ -318,16 +318,16 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
if (ncclParamP2pDirectDisable() == 0) recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
}
} else {
info->rank = intermediateRank;
}
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
return ncclSuccess;
@@ -358,7 +358,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
// Send SIMPLE buff to proxy, and replace it by local buffer
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
} else {
send->conn.tail = &remDevMem->tail;
+2 -2
View File
@@ -157,7 +157,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpySend) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
send->conn.tail = &proxyInfo.ceRecvMem->tail;
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
@@ -187,7 +187,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
NCCLCHECK(ncclProxyCallBlocking(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
recv->conn.tail = &proxyInfo.ceRecvMem->tail;
}