diff --git a/CMakeLists.txt b/CMakeLists.txt index 47bf2eed9a..2dbe38bfd4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,6 +111,7 @@ set(CC_SOURCES src/graph/connect.cc src/graph/tuning.cc src/graph/topo.cc + src/graph/xml.cc src/collectives/all_reduce.cc src/collectives/all_gather.cc src/collectives/reduce.cc @@ -122,6 +123,7 @@ set(CC_SOURCES src/misc/utils.cc src/misc/ibvwrap.cc src/misc/nvmlwrap_stub.cc + src/transport/coll_net.cc src/transport/net.cc src/transport/net_ib.cc src/transport/net_socket.cc diff --git a/Makefile b/Makefile index 1d39312f9c..caed3d42ac 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,5 @@ # # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. -# Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. # # See LICENSE.txt for license information # diff --git a/makefiles/common.mk b/makefiles/common.mk index 2e448268a6..ece18c7255 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/makefiles/version.mk b/makefiles/version.mk index 05abbc75c5..883e62575f 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 5 -NCCL_PATCH := 7 +NCCL_MINOR := 6 +NCCL_PATCH := 4 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 9ecb25327a..db1698a757 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,5 @@ # -# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. -# Modifications Copyright (c) 2015-2020, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -12,9 +11,9 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \ misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \ - transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \ + transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ - graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc + graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc ##### lib files LIBNAME := libnccl.so diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 0b0e2c105a..11ffc35cf0 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -1,6 +1,5 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/channel.cc b/src/channel.cc index 220a62b925..3527915a56 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -7,24 +7,32 @@ #include "channel.h" #include "param.h" +#include "graph.h" -NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES); +#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ +#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */ + +NCCL_PARAM(Buffsize, "BUFFSIZE", -2); ncclResult_t initChannel(struct ncclComm* comm, int channelid) { struct ncclChannel* channel = comm->channels+channelid; channel->id = channelid; // Setup intermediate buffering - channel->buffSize = ncclParamBuffsize(); + int buffSize = ncclParamBuffsize(); + int cpuArch, cpuVendor, cpuModel; + NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); + channel->buffSize = buffSize != -2 ? buffSize : + cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES; // Ring index to user rank table. NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); // Communication structures with peers. - NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks)); - NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks)); - for (size_t i=0; inRanks; ++i) { + NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. 
network) + NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); + for (size_t i=0; inRanks+1; ++i) { channel->peers[i].send.comm = comm; channel->peers[i].recv.comm = comm; } @@ -43,9 +51,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { CUDACHECK(hipFree(channel->ring.devUserRanks)); // Free transport proxy resources - for (int r=0; rpeers+r; if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); + } + for (int r=0; rpeers+r; if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); } diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 84ccfe5bac..42f3b6600d 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -72,6 +72,10 @@ template __attribute__((noinline)) __device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { } + template __attribute__((noinline)) __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { @@ -135,6 +139,10 @@ template __attribute__((noinline)) __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { } + #include "prims_ll128.h" template __attribute__((noinline)) @@ -200,3 +208,7 @@ __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) { template __attribute__((noinline)) __device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { } + +template +__attribute__((noinline)) +__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 0ac8d86a91..c2abcfff37 100644 --- a/src/collectives/device/all_reduce.h +++ 
b/src/collectives/device/all_reduce.h @@ -163,6 +163,63 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { } while(0); } +template +__attribute__((noinline)) +__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + const ssize_t size = args->N; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + int chunkSize = args->lastChunkSize; + const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; + + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + if (blockIdx.x < args->nChannels) { // first half of the channels do reduce + struct ncclTree* tree = &channel->collTreeUp; + ncclPrimitives prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + prims.send(thisInput+offset, nelem); + } else { + prims.recvReduceSend(thisInput+offset, nelem); + } + } + } + + if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast + struct ncclTree* tree = &channel->collTreeDn; + ncclPrimitives prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + 
int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + prims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + prims.recv(thisOutput+offset, nelem); + } else { + prims.recvCopySend(thisOutput+offset, nelem); + } + } + } +} + template __attribute__((noinline)) __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { @@ -298,6 +355,62 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { } while(0); } +template +__attribute__((noinline)) +__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + const ssize_t size = args->N; + ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; + + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + if (blockIdx.x < args->nChannels) { // first half of the channels do reduce + struct ncclTree* tree = &channel->collTreeUp; + ncclLLPrimitives LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } + } + + if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast + struct 
ncclTree* tree = &channel->collTreeDn; + ncclLLPrimitives LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.recv(thisOutput+offset, nelem); + } else { + LLprims.recvCopySend(thisOutput+offset, nelem); + } + } + } +} + #include "prims_ll128.h" template __attribute__((noinline)) @@ -437,3 +550,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) { } } } + +template +__attribute__((noinline)) +__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index e08219d695..2a17a32d7d 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -73,6 +73,10 @@ template __attribute__((noinline)) __device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { } + template __attribute__((noinline)) __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { @@ -122,6 +126,10 @@ template __attribute__((noinline)) __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { } + #include "prims_ll128.h" template __attribute__((noinline)) @@ -171,3 +179,7 @@ __device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) { template __attribute__((noinline)) __device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { } + +template +__attribute__((noinline)) +__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* 
args) { } diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index e37d1a54c8..5b5f645813 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -1,4 +1,3 @@ -#include "hip/hip_runtime.h" /************************************************************************* * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. @@ -51,7 +50,8 @@ static inline __device__ void exitIfAbortBarrier(int abort) { #define NCCL_FUNC4(coll, op, dtype) \ NCCL_FUNC5(coll##Tree, op, dtype), \ - NCCL_FUNC5(coll##Ring, op, dtype) + NCCL_FUNC5(coll##Ring, op, dtype), \ + NCCL_FUNC5(coll##CollNet, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -133,24 +133,30 @@ struct Caller{ inline __device__ void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept { - if (c->funcIndex < 240) { - if (c->funcIndex % 6 == 0) ncclBroadcastTreeLL_copy_i8(&c->args); - else if (c->funcIndex % 6 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args); - else if (c->funcIndex % 6 == 2) ncclBroadcastTree_copy_i8(&c->args); - else if (c->funcIndex % 6 == 3) ncclBroadcastRingLL_copy_i8(&c->args); - else if (c->funcIndex % 6 == 4) ncclBroadcastRingLL128_copy_i8(&c->args); - else ncclBroadcastRing_copy_i8(&c->args); + if (c->funcIndex < 360) { + if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args); + else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args); + else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args); + else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args); + else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args); + else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args); + else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args); + else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args); + 
else ncclBroadcastCollNet_copy_i8(&c->args); } - else if (c->funcIndex < 480) Caller<240, 480>::call(c); - else if (c->funcIndex < 720) { - if (c->funcIndex % 6 == 0) ncclAllGatherTreeLL_copy_i8(&c->args); - else if (c->funcIndex % 6 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args); - else if (c->funcIndex % 6 == 2) ncclAllGatherTree_copy_i8(&c->args); - else if (c->funcIndex % 6 == 3) ncclAllGatherRingLL_copy_i8(&c->args); - else if (c->funcIndex % 6 == 4) ncclAllGatherRingLL128_copy_i8(&c->args); - else ncclAllGatherRing_copy_i8(&c->args); + else if (c->funcIndex < 720) Caller<360, 720>::call(c); + else if (c->funcIndex < 1080) { + if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args); + else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args); + else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args); + else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args); + else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args); + else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args); + else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args); + else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args); + else ncclAllGatherCollNet_copy_i8(&c->args); } - else Caller<720, 1200>::call(c); + else Caller<1080, 1800>::call(c); } static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) { @@ -274,7 +280,8 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \ #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \ IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \ - IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) + IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \ + IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, 
ncclOp, ncclType, NCCL_ALGO_COLLNET) #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \ diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 477308b449..28e86c3ca9 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -346,14 +346,9 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t, template __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) -// Use UNROLL 4 for 2 SRCs, 2 for the rest -#define AUTOUNROLL (UNROLL*(2/MINSRCS)) -#else // Try to limit consecutive load/stores to 8. // Use UNROLL 8 when we have a single source and a single destination, 4 otherwise #define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS))) -#endif template __device__ void ReduceOrCopyMulti(const int tid, const int nthreads, diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 2158b6f262..f1da886908 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -20,7 +20,8 @@ NCCL_FUNC5(coll, op, dtype) \ #define NCCL_FUNC4(coll, op, dtype) \ NCCL_FUNC5(coll##Tree, op, dtype), \ - NCCL_FUNC5(coll##Ring, op, dtype) + NCCL_FUNC5(coll##Ring, op, dtype), \ + NCCL_FUNC5(coll##CollNet, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index 8b2ac5a3ba..07dcedbb72 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -283,7 +283,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) { r.recvStep[i] = ROUNDUP(r.recvStep[i], SLICESPERCHUNK*SLICESTEPS); #if defined(RCCL_USE_DIRECT_BUFFER) r.recvDirectBuff[i] = NULL; - if (directBuff && 
LOAD(&conn->direct)) { + if (directBuff && (LOAD(&conn->direct) & NCCL_DIRECT_GPU)) { r.recvDirectBuff[i] = directBuff; if (tid == 0) STORE(conn->ptrExchange, directBuff); } @@ -307,13 +307,13 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) { } } - __device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { s.sendBuff[i] = (T*)LOAD(&conn->buff); s.sendStep[i] = LOAD(&conn->step); s.sendStep[i] = ROUNDUP(s.sendStep[i], SLICESPERCHUNK*SLICESTEPS); #if defined(RCCL_USE_DIRECT_BUFFER) s.sendDirectBuff[i] = NULL; - if (directBuff && LOAD(&conn->direct)) { + if (directBuff && (LOAD(&conn->direct) & NCCL_DIRECT_GPU)) { void* volatile* ptr = LOAD(&conn->ptrExchange); while ((s.sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL); barrier(); @@ -324,7 +324,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) { if (wid == i) s.sendConnTail = s.sendConnHead = s.sendStep[i]; // Make sure we set this after rounding up nsend++; } - __device__ void loadSendSync() { + __device__ __forceinline__ void loadSendSync() { if (tid < nsend) { s.sendConnHeadPtr = LOAD(&s.sendConn->head); s.sendConnHeadCache = LOAD(s.sendConnHeadPtr); @@ -336,7 +336,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) { } } - __device__ void saveRecvSync() { + __device__ __forceinline__ void saveRecvSync() { if (tid >= nthreads-WARP_SIZE && wid < nrecv) { STORE(&r.recvConn->step, r.recvConnHead); STORE(r.recvConn->opCountLoc, opCount+1); @@ -344,7 +344,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) { } } - __device__ void saveSendSync() { + __device__ __forceinline__ void saveSendSync() { if (tid < nsend) { STORE(&s.sendConn->step, s.sendConnHead); STORE(s.sendConn->opCountLoc, opCount+1); diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index d000cf4206..437bef9336 
100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -53,6 +53,10 @@ template __attribute__((noinline)) __device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { } + template __attribute__((noinline)) __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { @@ -99,6 +103,10 @@ template __attribute__((noinline)) __device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { } + #include "prims_ll128.h" template __attribute__((noinline)) @@ -145,3 +153,7 @@ __device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) { template __attribute__((noinline)) __device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { } + +template +__attribute__((noinline)) +__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 4db35e0beb..3ddaa974d7 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -67,6 +67,10 @@ template __attribute__((noinline)) __device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { } + template __attribute__((noinline)) __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { @@ -127,6 +131,10 @@ template __attribute__((noinline)) __device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { } +template +__attribute__((noinline)) +__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { } + #include "prims_ll128.h" template __attribute__((noinline)) @@ -189,3 +197,7 @@ __device__ void 
ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) { template __attribute__((noinline)) __device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { } + +template +__attribute__((noinline)) +__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/debug.cc b/src/debug.cc index c3c0dfc9f2..46922d4e62 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -108,7 +108,6 @@ void ncclDebugInit() { if (debugFn[0] != '\0') { FILE *file = fopen(debugFn, "w"); if (file != NULL) { - INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn); ncclDebugFile = file; } } @@ -126,7 +125,7 @@ void ncclDebugInit() { */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { if (ncclDebugLevel == -1) ncclDebugInit(); - if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO; + if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } char hostname[1024]; getHostName(hostname, 1024, '.'); @@ -136,7 +135,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file char buffer[1024]; size_t len = 0; pthread_mutex_lock(&ncclDebugLock); - if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n"); if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN) len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line); diff --git a/src/enqueue.cc b/src/enqueue.cc index 9ac1832e3d..d9205ac516 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -7,6 +7,7 @@ #include "enqueue.h" #include "argcheck.h" +#include "coll_net.h" // Only generate inline kernels for LL #define NCCL_FUNC5(coll, op, dtype) \ @@ -16,7 +17,8 @@ #define NCCL_FUNC4(coll, op, dtype) \ NCCL_FUNC5(coll##Tree, op, dtype), \ - NCCL_FUNC5(coll##Ring, op, dtype) + NCCL_FUNC5(coll##Ring, op, dtype), \ + NCCL_FUNC5(coll##CollNet, op, 
dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -195,7 +197,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args))); } // Start the network proxies as soon as the kernel has been launched. We can't - // perform any CUDA call between the two or having a hipFree between the CUDA + // perform any CUDA call between the two or having a cudaFree between the CUDA // launch and the transportStartProxy call could cause a deadlock. // Also, starting the proxies after the CUDA launch seems to be better for // performance (latency). @@ -227,35 +229,23 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ -// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction -// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB. 
-static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .41, .27, .25, .39, .46, .72, .76, .87, .92, .97, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 } -}; - -static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .04, .08, .09, .09, .11, .13, .25, .40, .59, .76, .86, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 } -}; - static ncclResult_t getAlgoInfo(struct ncclInfo* info) { struct ncclComm* comm = info->comm; - float minTime = 3600000.0; // Hopefully no operation will take an hour to complete. + float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. // Find algorithm / protocol. 
info->algorithm = -1; info->protocol = -1; - for (int a=0; acomm->collNetSupport) + NCCLCHECK(collNetReduceSupport(info->datatype, info->op, &collNetTypeSupport)); + if (collNetTypeSupport != 1) nAlgos--; + for (int a=0; abandwidths[info->coll][a][p]; - if (bw == 0) continue; - int logSize = log2i(info->nBytes>>6); - if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize]; - else if (a == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[p][logSize]; - float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw); - if (time < minTime) { + float time; + NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, &time)); + if (time >= 0 && time < minTime) { info->algorithm = a; info->protocol = p; minTime = time; @@ -266,15 +256,14 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) { WARN("Error : no algorithm/protocol available"); return ncclInternalError; } - - if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, (int)minTime); + //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); - int nc = comm->nChannels; - int nt = comm->maxThreads[info->protocol]; + int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? 
comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down + int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; while (info->nBytes < nc*nt*threadThreshold) { - if (nc >= 2) nc--; + if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) // do not reduce threads count on VEGA #else @@ -301,7 +290,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) { case ncclCollAllGather: info->pattern = ncclPatternRing; break; case ncclCollAllReduce: - info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; + info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; default: WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); return ncclInternalError; @@ -316,6 +305,8 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { case ncclPatternTreeUpDown: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: + case ncclPatternCollTreeUp: + case ncclPatternCollTreeDown: info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; case ncclPatternRing: info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; @@ -360,6 +351,13 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo } // Use lastChunkSize as chunkSize coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) { + // Optimize chunkSize / nSteps + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < 
info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2; + // Use lastChunkSize as chunkSize + coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->protocol == NCCL_PROTO_LL) { int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; @@ -384,6 +382,8 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo proxyArgs->chunkSteps = chunkSteps; proxyArgs->protocol = info->protocol; proxyArgs->opCount = info->comm->opCount; + proxyArgs->dtype = info->datatype; + proxyArgs->redOp = info->op; TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, nLoops, proxyArgs->nsteps, info->comm); @@ -410,8 +410,11 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { WARN("Error : mixing different streams within a group call is not supported."); return ncclInvalidUsage; } - for (int bid=0; bidcomm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels); + + int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 
2 : 1; + for (int bid=0; bidcomm->myParams->gridDim.x % info->comm->nChannels; + struct ncclChannel* channel = info->comm->channels+channelId; if (channel->collCount == NCCL_MAX_OPS) { WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); @@ -420,6 +423,10 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { // Proxy proxyArgs.channel = channel; + // Adjust pattern for CollNet based on channel index + if (nSubChannels == 2) { + info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown; + } NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); info->comm->myParams->gridDim.x++; @@ -431,7 +438,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { memcpy(c, &coll, sizeof(struct ncclColl)); - c->args.bid = bid; + c->args.bid = bid % coll.args.nChannels; STORE(&c->active, 1); opIndex = (opIndex+1)%NCCL_MAX_OPS; c->nextIndex = opIndex; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index a325361f6f..6db2334183 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -15,7 +15,7 @@ /******************************************************************/ ncclResult_t ncclTopoPreset(struct ncclComm* comm, - struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->localRanks; @@ -28,9 +28,14 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, for (int i=0; itreeUp.down[i] = -1; channel->treeDn.up = -1; for (int i=0; itreeDn.down[i] = -1; + channel->collTreeUp.up = -1; + for (int i=0; icollTreeUp.down[i] = -1; + channel->collTreeDn.up = -1; + for (int i=0; icollTreeDn.down[i] = -1; int* ringIntra = ringGraph->intra+c*localRanks; int* treeIntra = treeGraph->intra+c*localRanks; + int* collNetIntra = collNetGraph->intra+c*localRanks; for (int i=0; itreeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ; channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0]; } + if (collNetIntra[i] == rank) { + int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks; + + // CollTrees are always symmetric, i.e. + // up/down go in reverse directions + channel->collTreeDn.up = collNetIntra[prev]; + channel->collTreeDn.down[0] = collNetIntra[next]; + channel->collTreeUp.down[0] = channel->collTreeDn.down[0]; + channel->collTreeUp.up = channel->collTreeDn.up; + } } topoRanks->ringPrev[c] = channel->ring.prev; topoRanks->ringNext[c] = channel->ring.next; @@ -175,6 +190,40 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* tr return ncclSuccess; } +ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) { + int nranks = comm->nRanks; + int depth = nranks/comm->nNodes; + int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 
0 : 1; // send GPU index depends on topo pattern + int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks; + for (int c=0; cnChannels/2; c++) { + struct ncclChannel* channel = comm->channels+c; + // Set root of collTree to id nranks + if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master + channel->collTreeUp.up = channel->collTreeDn.up = nranks; + } + if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain + channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1; + } + channel->collTreeUp.depth = channel->collTreeDn.depth = depth; + INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]); + } + int recvIndex = 0; // recv GPU index is always 0 + int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks; + for (int c=0; cnChannels/2; c++) { + struct ncclChannel* channel = comm->channels+comm->nChannels/2+c; + // Set root of collTree to id nranks + if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master + channel->collTreeUp.up = channel->collTreeDn.up = nranks; + } + if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain + channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1; + } + channel->collTreeUp.depth = channel->collTreeDn.depth = depth; + INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]); + } + return ncclSuccess; +} + // Legacy naming NCCL_PARAM(MinNrings, "MIN_NRINGS", -2); NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2); diff --git a/src/graph/paths.cc b/src/graph/paths.cc index eba1964f44..1dc8eb3979 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -1,5 +1,6 @@ /************************************************************************* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -42,7 +43,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; basePath->width = LOC_WIDTH; - basePath->type = LINK_LOC; + basePath->type = PATH_LOC; while (nodeList.count) { nextNodeList.count = 0; @@ -58,7 +59,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT } struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); - int width = std::min(path->width, link->width); + float width = std::min(path->width, link->width); if (remPath->width < width) { // Find reverse link for (int l=0; lnlinks; l++) { @@ -68,8 +69,8 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT } } if (remPath->list[0] == NULL) { - WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d", - remNode->id, remNode->type, remNode->nlinks, node->id, node->type); + WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx", + remNode->type, remNode->id, remNode->nlinks, node->type, node->id); return ncclInternalError; } // Copy the rest of the path @@ -77,9 +78,17 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT remPath->count = path->count + 1; remPath->width = width; - // Consider the path is QPI when going through the CPU - // Also don't consider LINK_NET as we only care about the NIC->GPU path. - int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type; + // Start with path type = link type. PATH and LINK types are supposed to match. 
+ // Don't consider LINK_NET as we only care about the NIC->GPU path. + int type = link->type == LINK_NET ? 0 : link->type; + // Differentiate between one and multiple PCI switches + if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB; + // Consider a path going through the CPU as PATH_PHB + if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB; + // Ignore Power CPU in an NVLink path + if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU && + link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0; + remPath->type = std::max(path->type, type); // Add to the list for the next iteration if not already in the list @@ -117,9 +126,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); offset = strlen(line); } - INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width); + INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width); #else - sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type); + sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]); offset = strlen(line); #endif } @@ -171,7 +180,7 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = LINK_QPI; + srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type); srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width); return ncclSuccess; } @@ -194,6 +203,131 @@ static void 
ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) } } +static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS }; +ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) { + if (*level == -1) { + int l = -1; + if (disableEnv) { + char* str = getenv(disableEnv); + if (str) { + int disable = strtol(str, NULL, 0); + if (disable == 1) l = 0; + } + } + if (l == -1) { + char* str = getenv(levelEnv); + if (str) { + for (int i=0; i= '0' && str[0] <= '9') { + int oldLevel = strtol(str, NULL, 0); + const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1; + if (oldLevel > maxOldLevel) oldLevel = maxOldLevel; + l = levelsOldToNew[oldLevel]; + } + } + } + if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]); + *level = l >= 0 ? l : -2; + } + return ncclSuccess; +} + +int ncclTopoUserP2pLevel = -1; +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) { + *p2p = 0; + + // Get GPUs from topology + int g1, g2; + NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1)); + struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; + if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) { + // GPU not found, we can't use p2p. + return ncclSuccess; + } + struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2; + + // In general, use P2P whenever we can. 
+ int p2pLevel = PATH_SYS; + + // Don't use P2P through ARM CPUs + int arch, vendor, model; + NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); + if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; + if (arch == NCCL_TOPO_CPU_ARCH_X86 && + vendor == NCCL_TOPO_CPU_VENDOR_INTEL && + model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB; + + // User override + NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); + if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel; + + // Compute the PCI distance and compare with the p2pLevel. + if (path->type <= p2pLevel) *p2p = 1; + + return ncclSuccess; +} + +NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); +int ncclTopoUserGdrLevel = -1; + +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) { + *useGdr = 0; + + // Get GPU and NET + int n, g; + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + + // Check that both the NIC and GPUs support it + if (net->net.gdrSupport == 0) return ncclSuccess; + if (gpu->gpu.gdrSupport == 0) return ncclSuccess; + + if (read) { // For reads (sends) only enable under certain conditions + int gdrReadParam = ncclParamNetGdrRead(); + if (gdrReadParam == 0) return ncclSuccess; +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + return ncclSuccess; +#else + if (gdrReadParam < 0) { + int nvlink = 0; + // Since we don't know whether there are other communicators, + // it's better to keep things local if we have a single GPU. 
+ if (system->nodes[GPU].count == 1) nvlink = 1; + for (int i=0; inodes[GPU].count; i++) { + if (i == g) continue; + if (gpu->paths[GPU][i].type == PATH_NVL) { + nvlink = 1; + break; + } + } + if (!nvlink) return ncclSuccess; + } +#endif + } + + // Check if we are close enough that it makes sense to enable GDR + int netGdrLevel = PATH_PXB; + NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL")); + if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel; + int distance = gpu->paths[NET][n].type; + if (distance > netGdrLevel) { + INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel); + return ncclSuccess; + } + + *useGdr = 1; + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read); + return ncclSuccess; +} + ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { // Precompute paths between GPUs/NICs. @@ -210,26 +344,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer // Compute paths to GPU g NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); + // Update path when we don't want to / can't use GPU Direct P2P + for (int p=0; pnodes[GPU].count; p++) { + int p2p; + NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p)); + if (p2p == 0) { + // Divert all traffic through the CPU + int cpu; + NCCLCHECK(getLocalCpu(system, g, &cpu)); + NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); + } + } + if (peerInfos == NULL) continue; - // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM - struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank; + // Remove GPUs we can't talk to because of containers. 
+ struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; - struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank; - int p2p; - NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo)); - if (p2p == 0) { - int shm; - NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); - if (shm == 1) { - // We cannot use GPU Direct, so we need all traffic to go through a CPU - int cpu; - NCCLCHECK(getLocalCpu(system, g, &cpu)); - NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); - } else { - // We cannot communicate with that peer. - system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; - } + struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank; + int shm; + NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); + if (shm == 0) { + // Mark this peer as inaccessible. We'll trim it later. + system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; } } } @@ -239,11 +376,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; NCCLCHECK(ncclTopoSetPaths(netNode, system)); - if (peerInfos == NULL) continue; for (int g=0; gnodes[GPU].count; g++) { - if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) { - // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths - // to go through a CPU + // Update path when we dont want to / can't use GPU Direct RDMA. 
+ int gdr; + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); + if (gdr == 0) { + // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; NCCLCHECK(getLocalCpu(system, g, &localCpu)); NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); @@ -251,7 +389,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer } } } - return ncclSuccess; } @@ -270,7 +407,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* domains[g] = std::min(domains[g], domains[p]); } } - if (gpu->rank == comm->rank) myDomain = domains[g]; + if (gpu->gpu.rank == comm->rank) myDomain = domains[g]; } int ngpus = system->nodes[GPU].count; @@ -288,98 +425,19 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* free(ids); return ncclInternalError; } - - // Remove GPUs I can't access (even indirectly) from my view of the node - for (int t=0; tnodes[t].count; n++) { - struct ncclTopoNode* node = system->nodes[t].nodes+n; - if (node == gpu) continue; - for (int l=0; lnlinks; l++) { - while (lnlinks && node->links[l].remNode == gpu) { - if (lnlinks-1) - memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink)); - node->nlinks--; - } - if (lnlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) { - node->links[l].remNode--; - } - } - } - } - if (g != system->nodes[GPU].count-1) - memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode)); - system->nodes[GPU].count--; + NCCLCHECK(ncclTopoRemoveNode(system, GPU, g)); } comm->localRanks = system->nodes[GPU].count; if (system->nodes[GPU].count == comm->nRanks) { - // Trim network - ncclTopoRemovePathType(system, NET); - system->nodes[NET].count = 0; - for (int t=0; tnodes[t].count; n++) { - struct ncclTopoNode* node = system->nodes[t].nodes+n; - for (int l=0; lnlinks; l++) { - struct ncclTopoLink* link = 
&(node->links[l]); - if (link->remNode->type == NET) { - // Remove the link - for (int i=l; i<(node->nlinks-1); i++) { - memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink)); - } - node->nlinks--; - l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop - } - } - } - } + for (int n=system->nodes[NET].count-1; n>=0; n--) + NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } free(domains); free(ids); return ncclSuccess; } -static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) { - int nvlSpeed = 0; - int nvlPeers = 0; - int pciSpeed = 0; - for (int l=0; lnlinks; l++) { - if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width; - if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2; - if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width; - } - *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed)); - return ncclSuccess; -} - -ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) { - // Compute max speed to try to accelerate the search. 
- system->maxSpeed = LOC_WIDTH; - - for (int g=0; gnodes[GPU].count; g++) { - NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed)); - } - if (system->nodes[NET].count) { - // Try to assign one NIC per GPU - int netMaxSpeed = 0; - int netMaxSpeedCount = 0; - for (int n=0; nnodes[NET].count; n++) { - int maxSpeed = 0; - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - for (int g=0; gnodes[GPU].count; g++) { - maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width); - } - if (maxSpeed > netMaxSpeed) { - netMaxSpeed = maxSpeed; - netMaxSpeedCount = 1; - } else if (maxSpeed == netMaxSpeed) { - netMaxSpeedCount++; - } - } - system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH); - } - return ncclSuccess; -} - void ncclTopoFree(struct ncclTopoSystem* system) { for (int t=0; t -static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) { - if (path->count == 0) return ncclSuccess; - - *node = NULL; - if (width > 0) { - if (path->type > graph->type) return ncclSuccess; - graph->type = std::max(graph->type, path->type); - graph->nHops += path->count; - } else { - graph->type = typeSave; - graph->nHops -= path->count; +// Initialize system->maxWidth. This is the per-channel (i.e. per-SM) +// max speed. +static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + float nvLinkWidth = VEGA_XGMI_WIDTH; +#else + float nvLinkWidth = gpu->gpu.cudaCompCap > 60 ? 
VOLTA_NVLINK_WIDTH : PASCAL_NVLINK_WIDTH; +#endif + float maxWidth = 0.0; + for (int i=0; inodes[type].count; i++) { + struct ncclTopoLinkList* path = gpu->paths[type]+i; + float width = path->width; + if (path->count == 0) continue; + if (path->type == PATH_NVL) width = std::min(nvLinkWidth, width); + maxWidth = std::max(maxWidth, width); } + return maxWidth; +} +ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { + system->maxWidth = 0.0; + int inter = system->nodes[NET].count; + if (inter == 0 && system->nodes[GPU].count == 1) { + system->maxWidth = LOC_WIDTH; + return ncclSuccess; + } + for (int g=0; gnodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU)); + } + return ncclSuccess; +} - for (int i=0; icount; i++) { - if (path->list[i]->width < width) { - // Can't follow this path, rewind and exit - for (int j=0; jlist[j]->width += width; +static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) { + for (int l=0; lnlinks; l++) { + struct ncclTopoLink* link = node2->links+l; + if (link->remNode == node1) { + *revLink = link; return ncclSuccess; } - path->list[i]->width -= width; } - *node = path->list[path->count-1]->remNode; + WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id); + return ncclInternalError; +} + +// This is unfortunately needed since manipulating floats often results in rounding errors. 
+#define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000) + +static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float speed, int* steps) { + float pciSpeed = speed; + for (int step=0; stepcount; step++) { + struct ncclTopoNode* node = path->list[step]->remNode; + if (node->type == CPU) { + // Account for P2P inefficiency through Intel CPU RC + if (path->type == PATH_PHB && start->type == GPU && + node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && + node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { + pciSpeed = INTEL_P2P_OVERHEAD(speed); + } + } + } + + struct ncclTopoNode* node = start; + for (int step=0; steplist[step]; + struct ncclTopoLink* revLink = NULL; + float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed; + float revSpeed = 0; + if (link->remNode->type == GPU && start->type != GPU) { + if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); + revSpeed += fwSpeed/8; + } + if (link->remNode->type == CPU && link->type == LINK_NVL) { + if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); + revSpeed += fwSpeed; + } + if (link->width < fwSpeed || (revSpeed && revLink->width < revSpeed)) { *steps = step; return ncclSuccess; } + SUB_ROUND(link->width, fwSpeed); + if (revSpeed) SUB_ROUND(revLink->width, revSpeed); + node = link->remNode; + } + *steps = maxSteps; + return ncclSuccess; +} + +// Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1). 
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) { + // First handle easy cases + *node = system->nodes[type2].nodes+index2; + if (type1 == -1) return ncclSuccess; + struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1; + struct ncclTopoLinkList* path = node1->paths[type2]+index2; + if (path->count == 0 ) return ncclSuccess; + + // Now check link type + *node = NULL; + int intra = type1 == GPU && type2 == GPU; + float speed = intra ? graph->speedIntra : graph->speedInter; + int type = intra ? graph->typeIntra : graph->typeInter; + + if (mult == 1 && (path->type > type)) return ncclSuccess; + + speed *= mult; + + // Check there is enough bandwidth on paths. + int step = 0; + NCCLCHECK(followPath(path, node1, path->count, speed, &step)); + if (step < path->count) goto rewind; + + // Enough bandwidth : return destination node. + graph->nHops += mult*path->count; + *node = system->nodes[type2].nodes+index2; + return ncclSuccess; + +rewind: + // Not enough bandwidth : rewind and exit. 
+ NCCLCHECK(followPath(path, node1, step, -speed, &step)); return ncclSuccess; } @@ -81,22 +177,42 @@ static int cmpIntraScores(struct ncclGpuScore* scores, int count) { return 0; } -static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) { - for (int n=0; nnodes[NET].count; n++) { - if (system->nodes[NET].nodes[n].used & flag) { - *netPaths=system->nodes[NET].nodes[n].paths[GPU]; +static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* index) { + for (int g=0; gnodes[GPU].count; g++) { + if (system->nodes[GPU].nodes[g].gpu.rank == rank) { + *index = g; return ncclSuccess; } } + WARN("Could not find gpu rank %d\n", rank); return ncclInternalError; } +static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int* index) { + for (int n=0; nnodes[NET].count; n++) { + if (system->nodes[NET].nodes[n].id == id) { + *index = n; + return ncclSuccess; + } + } + WARN("Could not find net id %lx\n", id); + return ncclInternalError; +} + +static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) { + int netId = graph->inter[graph->nChannels*2]; + int n; + NCCLCHECK(getNetIndex(system, netId, &n)); + *netPaths=system->nodes[NET].nodes[n].paths[GPU]; + return ncclSuccess; +} + ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) { const uint64_t flag = 1ULL<<(graph->nChannels); int ngpus = system->nodes[GPU].count; struct ncclTopoLinkList* paths = gpu->paths[GPU]; struct ncclTopoLinkList* netPaths = NULL; - if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths)); + if (sortNet) NCCLCHECK(getNetPaths(system, graph, &netPaths)); struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES]; memset(scores, 0, ngpus*sizeof(struct ncclGpuScore)); @@ -131,9 +247,13 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct 
ncclTopoSystem* system, struct ncc return ncclSuccess; } -ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time); +ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time); -#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should get contain all search within a second or so. +// Try to keep all searchs within one second +#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19) +#define NCCL_SEARCH_TIMEOUT (1<<18) +#define NCCL_SEARCH_TIMEOUT_TREE (1<<17) +#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10) #define FORCED_ORDER_PCI 1 #define FORCED_ORDER_REPLAY 2 @@ -143,7 +263,7 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels == 0) return ncclInternalError; int ngpus = system->nodes[GPU].count; int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1]; - for (int i=0; inodes[GPU].nodes[i].rank == nextRank) { + for (int i=0; inodes[GPU].nodes[i].gpu.rank == nextRank) { *g = i; return ncclSuccess; } @@ -151,44 +271,37 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } -ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time); +ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time); -ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) { - int 
typeSave = graph->type; +ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) { const uint64_t flag = 1ULL<<(graph->nChannels); - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave)); + struct ncclTopoNode* gpu; + NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu)); if (gpu) { gpu->used ^= flag; - NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time)); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time)); gpu->used ^= flag; - if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave)); + NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu)); } return ncclSuccess; } ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) { - // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels - // since it would likely impact the rings algorithms too. - if (graph->speedIntra > graph->speedInter && graph->nChannels < refGraph->nChannels) return ncclSuccess; + // 1. Constraint to get the same nChannels between Rings and Trees + if (graph->nChannels < graph->minChannels) return ncclSuccess; - // 1. Try to get better bandwidth + // 2. Try to get better bandwidth if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess; if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) { *copy = 1; return ncclSuccess; } - // 2. 
Give an advantage when all channels are the same - if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) { - *copy = 1; - return ncclSuccess; - } - // 3. Less hops - if (graph->nHops < refGraph->nHops) *copy = 1; + // 3. Less hops (but not at the price of going cross NICs) + if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1; return ncclSuccess; } -ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) { +ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) { if ((*time) <= 0) return ncclSuccess; (*time)--; @@ -196,55 +309,43 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo if (step == ngpus) { // Determine whether we found a better solution or not int copy = 0; - int sameChannels = graph->sameChannels; - if (graph->nChannels > 0) { - int* intra = graph->intra+graph->nChannels*ngpus; - for (int g=0; gsameChannels = 0; - } graph->nChannels++; NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, ©)); if (copy) { memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph)); - if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1; + if (graph->nChannels == graph->maxChannels) *time = -1; } - if (graph->nChannels < MAXCHANNELS/2) { - NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time)); + if (graph->nChannels < graph->maxChannels) { + NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time)); } graph->nChannels--; - graph->sameChannels = sameChannels; return ncclSuccess; } - graph->intra[graph->nChannels*ngpus+step] = gpu->rank; + graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank; + int g = 
gpu - system->nodes[GPU].nodes; if (step == backToNet) { // first get back to NIC if (system->nodes[NET].count) { - int maxWidth = 0; - struct ncclTopoLinkList* paths = gpu->paths[NET]; + int startNetIndex; + NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); + struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; for (int n=0; nnodes[NET].count; n++) { - if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue; - maxWidth = std::max(paths[n].width, maxWidth); - } - for (int n=0; nnodes[NET].count; n++) { - if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue; - if (paths[n].width == maxWidth) { - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - int typeSave = graph->type; - NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave)); - if (net) { - graph->inter[graph->nChannels*2+1] = net->id; - NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time)); - NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave)); - } + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); + if (net) { + graph->inter[graph->nChannels*2+1] = net->id; + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time)); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); } } } } else if (step < system->nodes[GPU].count-1) { // Go to next GPU - struct ncclTopoLinkList* paths = gpu->paths[GPU]; int next[NCCL_TOPO_MAX_NODES]; int count; if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order - next[0] = step+1; + next[0] = 
(busIdToCudaDev(gpu->id)+1)%system->nodes[GPU].count; count = 1; } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next)); @@ -253,64 +354,64 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 )); } for (int i=0; invlink; - graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0; - int speed = graph->speedIntra; - if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed); - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed)); - graph->nvlink = nvlink; + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, step+1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i])); } } else if (step == backToFirstRank) { // Find first GPU and loop back to it - int g; - int rank = graph->intra[graph->nChannels*ngpus]; - for (g=0; gnodes[GPU].nodes[g].rank == rank) break; - } - if (g == ngpus) { - WARN("Could not find GPU with rank %d\n", rank); - return ncclInternalError; - } - struct ncclTopoLinkList* paths = gpu->paths[GPU]; - struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g; - int typeSave = graph->type; - NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave)); + int p; + NCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels*ngpus], &p)); + struct ncclTopoNode* firstGpu; + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu)); if (firstGpu) { - NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time)); - NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave)); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, time)); + 
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu)); } } else { // Next path - NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time)); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } return ncclSuccess; } -ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) { - const uint64_t flag = 1ULL<<(graph->nChannels); +ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { const int speed = graph->speedInter; for (int n=0; nnodes[NET].count; n++) { struct ncclTopoNode* net = system->nodes[NET].nodes+n; struct ncclTopoNode* gpu; - if (net->used == 0) { - graph->inter[graph->nChannels*2] = net->id; - for (int i=0; inodes[NET].count; i++) { - if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag; - } - struct ncclTopoLinkList* paths = net->paths[GPU]; + if (graph->collNet && net->net.collSupport == 0) continue; + if (net->net.width < speed) continue; + if (net->net.maxChannels == 0) continue; - // First try the PCI order to set a reference - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed)); - // Then try to replay the last channel - if (graph->nChannels > 0) { - int g; - NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed)); + graph->inter[graph->nChannels*2] = net->id; + for (int i=0; inodes[NET].count; i++) { + if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && + (system->nodes[NET].nodes[i].net.port == 
net->net.port)) { + system->nodes[NET].nodes[i].net.width -= speed; + } + } + net->net.maxChannels--; + + // First try to replay the last channel + if (graph->nChannels > 0) { + int g; + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); + } + if (graph->nChannels == 0 || graph->sameChannels == 0) { + if (graph->nChannels == 0) { + // Always try the PCI order first to set a reference + struct ncclTopoLinkList* paths = net->paths[GPU]; + // find the first GPU that is closest to NIC + int f = 0; + for (int i = 0; inodes[GPU].count; i++) + if (paths[i].count < paths[f].count) f = i; + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, NET, n, f)); } // Then try the most local GPUs - int maxWidth = 0, minHops = 0xfffffff; + float maxWidth = 0; + int minHops = 0xfffffff; + struct ncclTopoLinkList* paths = net->paths[GPU]; for (int g=0; gnodes[GPU].count; g++) { if (paths[g].width > maxWidth) { maxWidth = paths[g].width; @@ -329,14 +430,19 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo gpu = system->nodes[GPU].nodes+g; int gpuUsed = gpuPciWidth(gpu) > 0 ? 
0 : 1; if (tryGpuBidir == gpuUsed) { - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } } } - for (int i=0; inodes[NET].count; i++) { - if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag; + } + + net->net.maxChannels++; + for (int i=0; inodes[NET].count; i++) { + if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && + (system->nodes[NET].nodes[i].net.port == net->net.port)) { + system->nodes[NET].nodes[i].net.width += speed; } } } @@ -375,17 +481,152 @@ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, in return ncclSuccess; } -ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) { +ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) { int backToNet, backToFirstRank; NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); if (system->nodes[NET].count) { // Start from NET - ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time); + ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { - // Start from GPU 0 - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra)); - if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra)); - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra)); + // Intra-node only. 
+ if (graph->nChannels == 0) { + // Try PCI order first + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0)); + } else { + // Also try to replay previous channel + int g; + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g)); + } + if (graph->sameChannels == 0 || graph->nChannels == 0) { + // Finally, try all other possibilities unless we are forced to use the same channels + for (int g=0; gnodes[GPU].count; g++) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g)); + } + } + } + return ncclSuccess; +} + +/************************************/ +/* User defined graph from XML file */ +/************************************/ + +struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } }; +ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + int ngpus = system->nodes[GPU].count; + int* inter = graph->inter+2*c; + int* intra = graph->intra+ngpus*c; + int n=0, g=0; + for (int s=0; snSubs; s++) { + struct ncclXmlNode* sub = xmlChannel->subs[s]; + int dev; + NCCLCHECK(xmlGetAttrInt(sub, "dev", &dev)); + if (strcmp(sub->name, "net") == 0) { + inter[n++] = dev; + } else if (strcmp(sub->name, "gpu") == 0) { + int rank = -1; + for (int g=0; gnodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank; + } + if (rank == -1) { + WARN("XML Import Channel : dev %d not found.", dev); + return ncclSystemError; + } + intra[g++] = rank; + } + } + return ncclSuccess; +} +ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + 
int id; + NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id)); + if (graph->id != id) return ncclSuccess; + + int crossNic; + NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic)); + if (graph->crossNic == 0 && crossNic == 1) return ncclSuccess; + graph->crossNic = crossNic; + + NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern)); + NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); + NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra)); + NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter)); + const char* str; + NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); + NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); + NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); + NCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType)); + NCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels)); + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph)); + } + return ncclSuccess; +} +ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph)); + } + return ncclSuccess; +} + +/* And the reverse : graph->xml */ +ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) { + struct ncclXmlNode* xmlChannel; + int ngpus = system->nodes[GPU].count; + int* inter = graph->inter+2*c; + int* intra = graph->intra+ngpus*c; + NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel)); + struct ncclXmlNode* node; + if (system->nodes[NET].count) { + NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); + NCCLCHECK(xmlSetAttrInt(node, "dev", inter[0])); + } + for (int g=0; gnodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev; + } + if (dev == 
-1) { + WARN("XML Export Channel : rank %d not found.", intra[g]); + return ncclInternalError; + } + NCCLCHECK(xmlSetAttrInt(node, "dev", dev)); + } + if (system->nodes[NET].count) { + NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); + NCCLCHECK(xmlSetAttrInt(node, "dev", inter[1])); + } + return ncclSuccess; +} +ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) { + struct ncclXmlNode* xmlGraph; + NCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph)); + NCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id)); + NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern)); + NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic)); + NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter)); + const char* str; + NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); + NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str)); + NCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType)); + NCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str)); + NCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels)); + for (int c=0; cnChannels; c++) { + NCCLCHECK(ncclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph)); + } + return ncclSuccess; +} +ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml) { + xml->maxIndex = 0; + struct ncclXmlNode* xmlGraphs; + NCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs)); + NCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", NCCL_GRAPH_XML_VERSION)); + for (int g=0; gnodes[GPU].nodes+i; if (node->paths[GPU] == NULL) continue; - int sum = ngpus*(ngpus-1)/2 - node->rank; + int sum = ngpus*(ngpus-1)/2 - node->gpu.rank; int count = 0; for (int n = 0; npaths[GPU][n].type != LINK_NVL) 
continue; - sum -= system->nodes[GPU].nodes[n].rank; + struct ncclTopoLink* link; + for (link = node->links; link->remNode; link++) { + if (link->remNode->gpu.rank == n) break; + } + if (!link->remNode) continue; + if (link->type != LINK_NVL) continue; + sum -= system->nodes[GPU].nodes[n].gpu.rank; count ++; } if(count != ngpus-2 || sum < 0 || sum > ngpus-1) { @@ -492,28 +738,39 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) { return; } +float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +#define NSPEEDS (sizeof(speedArray)/sizeof(float)) + ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; graph->speedIntra = graph->speedInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; - graph->nvlink = 0; - graph->type = LINK_LOC; + graph->typeIntra = ngpus == 1 ? 
PATH_LOC : PATH_NVL; + graph->typeInter = PATH_PIX; graph->nChannels = 0; graph->sameChannels = 1; - char* str = getenv("NCCL_GRAPH"); + char* str = getenv("NCCL_GRAPH_FILE"); + if (str) { + struct ncclXml* xml; + NCCLCHECK(ncclCalloc(&xml, 1)); + NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml)); + NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph)); + free(xml); + if (graph->nChannels > 0) return ncclSuccess; + } + if (!str) parseChordalRing(system, &str); if (str) { NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra)); for (int i=0; inChannels*ngpus; i++) { // Translate gpu numbers into ranks - graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank; + graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].gpu.rank; } // TODO : let user specify NICs graph->inter[0] = graph->inter[1] = 0; graph->speedIntra = graph->speedInter = system->maxWidth; - graph->nvlink = 0; if (graph->pattern == NCCL_TOPO_PATTERN_RING) { // Reverse the loop for (int c=0; cnChannels; c++) { @@ -531,22 +788,24 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); - int bestSpeed = 0; // First try crossnic, then decrease speed and finally increase speedIntra. - tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth; - int maxSpeed = system->maxSpeed; tmpGraph.pattern = graph->pattern; + int pass = 1; + int speedIndex = 0; + while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++; + tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; search: - int time = NCCL_SEARCH_TIMEOUT; - int stepSpeed = system->maxWidth/4; - tmpGraph.nvlink = 1; + int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS : + tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 
NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT; tmpGraph.nChannels = 0; - tmpGraph.sameChannels = 1; - NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time)); + globalTimeout -= time; + + NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time)); #if 0 - printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : ""); + printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : ""); for (int c=0; cnChannels; c++) { printf("%2d : ", c); for (int g=0; gnChannels > 0) goto done; - if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra; + // Optimal solution, stop here + if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done; - if (tmpGraph.speedIntra == tmpGraph.speedInter) { - // First pass, we don't have a solution yet ; try to go slower. + if (pass == 1) { + // First pass, we don't have a solution yet ; try other options + + // Try having different channels + if (tmpGraph.sameChannels == 1) { + tmpGraph.sameChannels = 0; + goto search; + } + tmpGraph.sameChannels = 1; + + if (time != -1) globalTimeout += time; + else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; + if (globalTimeout < 0) goto done; + + int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS; + if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { + tmpGraph.typeIntra += 1; + goto search; + } + tmpGraph.typeIntra = ngpus == 1 ? 
PATH_LOC : PATH_NVL; + if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) { + tmpGraph.typeInter += 1; + goto search; + } + tmpGraph.typeInter = PATH_PIX; // Try a simpler tree if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) { @@ -574,50 +854,61 @@ search: } tmpGraph.pattern = graph->pattern; - if (tmpGraph.type < LINK_QPI) { - tmpGraph.type += 1; - goto search; - } - tmpGraph.type = graph->type; - if (crossNic && tmpGraph.crossNic == 0) { // Try again with crossNic if permitted tmpGraph.crossNic = crossNic; goto search; } - tmpGraph.crossNic = graph->crossNic; + tmpGraph.crossNic = 0; + + // Decrease speed until we find a solution + if ((speedIndex < NSPEEDS-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) { + tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex]; + goto search; + } + speedIndex = 0; + while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++; + tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; - // Try to reduce speed per channel - tmpGraph.speedIntra = tmpGraph.speedInter -= stepSpeed; - if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= stepSpeed) goto search; } done: - // We have a solution now. See if we can increase speedIntra - if (tmpGraph.speedIntra == tmpGraph.speedInter) { + // We have a solution. Start from that solution and move to pass 2. + if (pass == 1) { + time = -1; + memcpy(&tmpGraph, graph, sizeof(tmpGraph)); + speedIndex = 0; + while (speedArray[speedIndex] > graph->speedInter && speedIndex < NSPEEDS-1) speedIndex++; + tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex]; + tmpGraph.minChannels = graph->nChannels; + pass = 2; + } + + // 3. 
See if we can increase speedIntra for trees (2 nodes or collnet) + if (pass == 2) { + if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && + tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 && + speedIndex > 0) { + tmpGraph.speedIntra = speedArray[--speedIndex]; + goto search; + } time = -1; memcpy(&tmpGraph, graph, sizeof(tmpGraph)); } - if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) { - // Try to increase the intra speed only but keeping nChannels the same - tmpGraph.speedIntra += stepSpeed; - maxSpeed = tmpGraph.speedIntra * graph->nChannels; - if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search; - } - if (graph->nChannels == 0) { + if (graph->nChannels == 0 && graph->collNet == 0) { WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern); - for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].rank; + for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; - graph->speedIntra = graph->speedInter = stepSpeed; - graph->nvlink = 0; + graph->speedIntra = graph->speedInter = 0.1; + graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } return ncclSuccess; } ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { - INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels); + INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels); int ngpus = system->nodes[GPU].count; char line[1024]; @@ -641,6 +932,18 @@ 
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr return ncclSuccess; } +ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) { + char* str = getenv("NCCL_GRAPH_DUMP_FILE"); + if (str) { + struct ncclXml* xml; + NCCLCHECK(ncclCalloc(&xml, 1)); + NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml)); + NCCLCHECK(ncclTopoDumpXmlToFile(str, xml)); + free(xml); + } + return ncclSuccess; +} + ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) { *dev = graph->inter[(channelId%graph->nChannels)*2+dir]; return ncclSuccess; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 4a670aae8a..d94129cbdf 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -11,69 +11,31 @@ #include "comm.h" #include "nvmlwrap.h" #include "net.h" +#include "coll_net.h" #include #include #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #include #include #endif +#include "xml.h" +#include "cpuset.h" #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) -const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" }; - const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) -const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "QPI", "NET" }; +const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "XGMI", "PIX", "PXB", "PHB", "SYS", "NET" }; #else -const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" }; -#endif - -#ifdef TOPO_EXPL -#include "model.h" -extern NodeModel *node_model; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" }; #endif /******************************************************************/ /******************* Graph Creation Functions *********************/ /******************************************************************/ -#ifndef TOPO_EXPL -static int getNumaId(char *path) { - char npath[PATH_MAX]; - snprintf(npath, PATH_MAX, "%s/numa_node", path); - npath[PATH_MAX-1] = '\0'; - - int numaId = -1; - FILE *file = fopen(npath, "r"); - if (file == NULL) return -1; - if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; } - fclose(file); - - return numaId; -} - -static ncclResult_t getPciPath(char* busId, char** path) { - for (int i=0; igetNumaId(path); -} - -static ncclResult_t getPciPath(char* busId, char** path) { - return node_model->getGpuPciPath(busId, path); -} -#endif // Get an int64 from a PCI 
path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000. ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { @@ -83,147 +45,43 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) // Find next / while (*str != '/') str--; str++; - NCCLCHECK(busIdToInt64(str, id)); + int64_t numid; + NCCLCHECK(busIdToInt64(str, &numid)); + // Ignore subdevice because those should use the same PCI link so we want to merge nodes. + numid -= numid & 0xf; + *id = numid; return ncclSuccess; } -static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) { - *index = -1; - for (int i=0; inodes[GPU].count; i++) { - if (system->nodes[GPU].nodes[i].id == id) { - *index = i; - } +static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { + *cpu = NULL; + if (node->type == CPU) { + *cpu = node; + return ncclSuccess; + } + for (int l=0; lnlinks; l++) { + if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + if (*cpu != NULL) return ncclSuccess; } return ncclSuccess; } - -static ncclResult_t getPath(int64_t id, char** path) { - char busId[] = "0000:00:00.0"; - NCCLCHECK(int64ToBusId(id, busId)); - NCCLCHECK(getPciPath(busId, path)); - return ncclSuccess; -} - -ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) { - char busId[BUSID_SIZE]; - CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); - NCCLCHECK(getPciPath(busId, path)); - return ncclSuccess; -} - - int interCpuWidth = 0; int cpuPciWidth = 0; -int p2pPciWidth = 0; -#ifndef TOPO_EXPL -static ncclResult_t getCpuWidths() { - // Check if already detected - if (interCpuWidth + cpuPciWidth + p2pPciWidth) return ncclSuccess; - - // Defaults - char cpu[256]; - sprintf(cpu, "Generic"); - cpuPciWidth = interCpuWidth = p2pPciWidth = PCI_WIDTH; - -#ifdef __PPC__ - sprintf(cpu, "ppc64"); - interCpuWidth = P9_WIDTH; -#endif -#ifdef __x86_64__ - 
sprintf(cpu, "x86_64"); - union { - struct { - // CPUID 0 String register order - uint32_t ebx; - uint32_t edx; - uint32_t ecx; - }; - char vendor[12]; - } cpuid0; - - asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0)); - if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel"); - else if (strncmp(cpuid0.vendor, "AuthenticAMD", 12) == 0) sprintf(cpu, "AMD"); - - if (strcmp(cpu, "Intel") == 0) { - union { - struct { - int steppingId:4; - int model:4; - int familyId:4; - int processorType:2; - int resv0:2; - int extModel:4; - int extFamily:8; - int resv1:4; - }; - uint32_t val; - } cpuid1; - asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1)); - if (cpuid1.familyId == 6 && (cpuid1.model + cpuid1.extModel * 16) >= 0x55) { // Skylake - sprintf(cpu, "Intel/Skylake (or later)"); - interCpuWidth = SKL_QPI_WIDTH; - cpuPciWidth = SKL_CPUPCI_WIDTH; - p2pPciWidth = SKL_PCI_WIDTH; - } else { - interCpuWidth = QPI_WIDTH; - } +static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) { + *width = LOC_WIDTH; + if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) { + *width = P9_WIDTH; + return ncclSuccess; } - else if (strcmp(cpu, "AMD") == 0) { - union { - struct { - uint32_t steppingId:4; - uint32_t model:4; - uint32_t family:4; - uint32_t resv0:4; - uint32_t extModel:4; - uint32_t extFamily:8; - uint32_t resv1:4; - }; - uint32_t val; - } cpuid1; - asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1)); - if ((cpuid1.family + cpuid1.extFamily) == 23 && (cpuid1.model + cpuid1.extModel * 16) >= 49) { - sprintf(cpu, "AMD/Rome (or later)"); - interCpuWidth = ROME_QPI_WIDTH; - cpuPciWidth = ROME_CPUPCI_WIDTH; - p2pPciWidth = ROME_PCI_WIDTH; - } else { - interCpuWidth = QPI_WIDTH; - } + if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) { + *width = ARM_WIDTH; + return ncclSuccess; + } + if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { + *width = cpu->cpu.model == 
NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH; } -#endif - INFO(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth); - return ncclSuccess; -} -#else -static ncclResult_t getCpuWidths() { - char cpu[256]; - node_model->getCpuWidths(cpu, &interCpuWidth, &cpuPciWidth, &p2pPciWidth); - TRACE(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth); - return ncclSuccess; -} -#endif - -static ncclResult_t ncclTopoGetInterCpuWidth(int* width) { - NCCLCHECK(getCpuWidths()); - *width = interCpuWidth; - return ncclSuccess; -} -static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) { - NCCLCHECK(getCpuWidths()); - *width = cpuPciWidth; - return ncclSuccess; -} -static ncclResult_t ncclTopoGetPciWidth(int* width) { - NCCLCHECK(getCpuWidths()); - *width = p2pPciWidth; - return ncclSuccess; -} -static ncclResult_t ncclTopoGetNetWidth(int* width) { - *width = NET_WIDTH; return ncclSuccess; } @@ -234,362 +92,101 @@ enum ncclNvLinkDeviceType { ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) }; -static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { - char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class"; - memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); - char* rPath = realpath(classPath, NULL); - int fd; - if ((fd = open(rPath, O_RDONLY)) == -1) { - // Could not find device. It might be because we're in a VM and - // we don't see the whole machine. This is handled silently so - // we don't want to print an INFO error. 
- TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); - return ncclSystemError; - } - free(rPath); - char pciClass[9]; - strncpy(pciClass, "0x000000", 9); - int len; - SYSCHECKVAL(read(fd, pciClass, 8), "read", len); - SYSCHECK(close(fd), "close"); - if (strcmp(pciClass, "0x068000") == 0) { - // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) - *type = ncclNvLinkDeviceSwitch; - } else if (strcmp(pciClass, "0x068001") == 0) { - // PCI device is of type "Bridge: IBM Device 04ea" - *type = ncclNvLinkDeviceBridge; - } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) - || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) - *type = ncclNvLinkDeviceGpu; - } else { - *type = ncclNvLinkDeviceUnknown; +ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { + for (int i=0; inodes[type].count; i++) { + if (system->nodes[type].nodes[i].id == id) { + *node = system->nodes[type].nodes+i; + return ncclSuccess; + } } return ncclSuccess; } -ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) { - struct ncclTopoNode* cpuNode = NULL; - for (int c=0; cnodes[CPU].count; c++) { - if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c; +ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { + if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { + WARN("Error : tried to create too many nodes of type %d\n", type); + return ncclInternalError; } - if (cpuNode == NULL) { // Create CPU - NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId)); + struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; + system->nodes[type].count++; + n->type = type; + n->id = id; + if (type == GPU) { + // Create link to itself (used in some corner cases) + n->nlinks=1; + n->links[0].type = 
LINK_LOC; + n->links[0].remNode = n; + n->links[0].width = LOC_WIDTH; + n->gpu.dev = NCCL_TOPO_UNDEF; + n->gpu.rank = NCCL_TOPO_UNDEF; + n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; + } else if (type == CPU) { + n->cpu.arch = NCCL_TOPO_UNDEF; + n->cpu.vendor = NCCL_TOPO_UNDEF; + n->cpu.model = NCCL_TOPO_UNDEF; + } else if (type == NET) { + n->net.asic = 0ULL; + n->net.port = NCCL_TOPO_UNDEF; + n->net.width = 0.0; } - NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth)); - NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth)); + *node = n; return ncclSuccess; } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) -#define VEGA_XGMI_WIDTH 20 -#define VEGA_XGMI_MAX_LINKS 6 -extern int busIdToCudaDev(int64_t busId); - -ncclResult_t ncclTopoConnectXGMI(struct ncclComm* comm, struct ncclTopoSystem* system) { - struct ncclTopoNode* nvsNode = NULL; - - int minNvlinks = VEGA_XGMI_MAX_LINKS, minWidth = VEGA_XGMI_WIDTH; - for (int g1=0; g1nodes[GPU].count; g1++) { - int nvlinks = 0; - for(int g2=0; g2nodes[GPU].count; g2++) { - if (g1 == g2) continue; - struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; - struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; - uint32_t link_type, hops; - int cudaDev1 = busIdToCudaDev(comm->peerInfo[gpu1->rank].busId); - int cudaDev2 = busIdToCudaDev(comm->peerInfo[gpu2->rank].busId); -#ifndef TOPO_EXPL - if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) { -#else - if (node_model->getLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) { -#endif - if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) { - NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu2, LINK_NVL, minWidth)); - nvlinks++; +ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) { + struct ncclTopoNode* delNode = system->nodes[type].nodes+index; + for (int t=0; tpaths[t]); + for (int n=0; nnodes[t].count; n++) { + struct ncclTopoNode* node = 
system->nodes[t].nodes+n; + if (node == delNode) continue; + for (int l=0; lnlinks; l++) { + while (lnlinks && node->links[l].remNode == delNode) { + memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink)); + node->nlinks--; + } + if (lnlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) { + node->links[l].remNode--; } } } - minNvlinks = std::min(minNvlinks, nvlinks); } - int pciWidth; - NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); - system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth; - system->maxWidth = minNvlinks ? minWidth : pciWidth; - return ncclSuccess; -} -#else -ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) { - struct ncclTopoNode* nvsNode = NULL; - - int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH; - for (int g=0; gnodes[GPU].count; g++) { - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - int cudaMajor, cudaMinor; - NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor)); - int maxNvLinks, width; - if (cudaMajor < 6) { - maxNvLinks = 0; - width = 0; - } else if (cudaMajor == 6) { - maxNvLinks = 4; - width = PASCAL_NVLINK_WIDTH; - } else { - maxNvLinks = 6; - width = VOLTA_NVLINK_WIDTH; - } - - int nvlinks = 0; - for (int l=0; lnodes[GPU].nodes+peer, LINK_NVL, width)); - nvlinks++; - } - } else if (type == ncclNvLinkDeviceBridge) { - // Nvlink between GPU and CPU (PPC) - // Since the remote bridge does not have a valid numa_node, assume we - // are connected to the closest CPU. - char* path; - NCCLCHECK(getPath(gpu->id, &path)); - int numaId = getNumaId(path); - free(path); - NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width)); - nvlinks++; - } else { // Nvswitch - if (type == ncclNvLinkDeviceUnknown) { - // The NVLink is up but we couldn't find the PCI device on the other - // side. Assume it's an NVswitch outside a VM. 
- if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId); - } - if (nvsNode == NULL) { // Create nvswitch - NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0)); - } - NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH)); - NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH)); - nvlinks++; - } - } - minNvlinks = std::min(minNvlinks, nvlinks); - minWidth = std::min(minWidth, width); - } - int pciWidth; - NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); - system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth; - system->maxWidth = minNvlinks ? minWidth : pciWidth; - return ncclSuccess; -} -#endif - -ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) { - struct ncclTopoNode* lastNode = endNode; - int pciWidth; - NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); - // Find intermediate PCI switches - int slashCount = 0; - int offsetRC = 0; - while (offsetRC < strlen(path)) { - if (path[offsetRC] == '/') slashCount++; - if (slashCount == 4) break; - offsetRC++; - } - int offset = strlen(path); - slashCount = 0; - while (--offset > offsetRC) { - if (path[offset] == '/') { - slashCount++; - // Find if already existing - if ((slashCount%2) == 0) { - int64_t pciId; - NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId)); - for (int p=0; pnodes[PCI].count; p++) { - if (system->nodes[PCI].nodes[p].id == pciId) { - // Found our PCI switch. 
Attach and stop since the rest should already - // be connected - NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth)); - NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth)); - return ncclSuccess; - } - } - struct ncclTopoNode* pciNode; - NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId)); - NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth)); - NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth)); - lastNode = pciNode; - } - } - } - // Then attach to a CPU node - int numaId = getNumaId(path); - int width; - NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); - NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width)); + memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode)); + system->nodes[type].count--; return ncclSuccess; } -// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports. -#include -#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid" -#ifndef TOPO_EXPL -uint64_t getIbGuid(char* path) { - uint64_t guid = 0ULL; - char guidPath[PATH_MAX]; - snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path); - // PATH has a wildcard in it so use glob() - glob_t globbuf; - glob(guidPath, 0, NULL, &globbuf); - if (globbuf.gl_pathc > 0) - strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX); - globfree(&globbuf); - guidPath[PATH_MAX-1] = '\0'; - FILE *file = fopen(guidPath, "r"); - if (file != NULL) { - uint64_t a, b, c, d; - if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) { - guid = (a << 48) + (b << 32) + (c<<16) + d; - TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid); - } - fclose(file); +ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) { + // Aggregate links into higher width for NVLink + struct ncclTopoLink* link; + for (link = node->links; link->remNode; link++) { + if (link->remNode 
== remNode && link->type == type) break; } - return guid; -} -#else -uint64_t getIbGuid(char* path) { - return node_model->getIbGuid(path); -} -#endif + if (link->remNode == NULL) node->nlinks++; + link->type = type; + link->remNode = remNode; + link->width += width; -struct netInfo { - char* path; - int64_t nic; - uint64_t asic; - int port; - int net; -}; - -ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) { - for (int n=0; nnic = n; - info->asic = n; - info->port = 0; - info->net = n; - if (info->path && (ibGuid = getIbGuid(info->path)) != 0) { - info->asic = ibGuid; - - // Ignore PCI subdevice when computing the ID to merge multi-port cards - // and make them use the same PCI link. - char* path = strdup(info->path); - path[strlen(path)-1]='0'; - NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic)); - free(path); - - // Same PCI path -> different ports of the same NIC - for (int i=0; inic) info->port++; - - // Same GUID -> same network links as the other NIC - for (int i=0; iasic && netInfos[i].port == info->port) info->net = netInfos[i].net; - } - INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net); + // Sort links in BW descending order + struct ncclTopoLink linkSave; + memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); + while (link != node->links) { + if ((link-1)->width >= linkSave.width) break; + memcpy(link, link-1, sizeof(struct ncclTopoLink)); + link--; } + memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); return ncclSuccess; } -ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) { - for (int g=0; gnodes[GPU].count; g++) { - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - char* path; - NCCLCHECK(getPath(gpu->id, &path)); - NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path)); - free(path); - } - - // Connect the NICs - int netDevCount; - NCCLCHECK(ncclNetDevices(&netDevCount)); - int netWidth; - NCCLCHECK(ncclTopoGetNetWidth(&netWidth)); - - struct 
netInfo* netInfos; - NCCLCHECK(ncclCalloc(&netInfos, netDevCount)); - - for (int n=0; nnodes[NIC].count; i++) { - if (system->nodes[NIC].nodes[i].id == info->nic) { - nicNode = system->nodes[NIC].nodes+i; - break; - } - } - if (!nicNode) { - NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic)); - if (info->path) { - // Create the PCI path - NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path)); - } else { - // This is probably a virtual NIC. Just attach it directly to CPU 0 - int width; - NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); - NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width)); - } - } - free(info->path); - - // Create the network side - struct ncclTopoNode* netNode; - NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n)); - - // Use rank to store the net information - netNode->rank = info->net; - - NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth)); - NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth)); - } - free(netInfos); - +ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { // And connect all CPU nodes together for (int n=0; nnodes[CPU].count; n++) { for (int p=0; pnodes[CPU].count; p++) { if (n == p) continue; - int width; - NCCLCHECK(ncclTopoGetInterCpuWidth(&width)); - NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width)); + float width; + NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width)); + NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width)); } } return ncclSuccess; @@ -597,7 +194,9 @@ ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) { static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) { if (node->type == GPU) { - sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank); + sprintf(line+offset, "%s/%lX (%d)", 
topoNodeTypeStr[node->type], node->id, node->gpu.rank); + } else if (node->type == CPU) { + sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model); } else { sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id); } @@ -607,14 +206,14 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; if (link->type == LINK_LOC) continue; - if (link->remNode != prevNode) { - sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width); + if (link->type != LINK_PCI || link->remNode != prevNode) { + sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width); int nextOffset = strlen(line); if (link->type == LINK_PCI) { NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { - sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank); + sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width); } else { sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); } @@ -626,7 +225,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN } ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { - INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed); + INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth); char line[1024]; for (int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); INFO(NCCL_GRAPH, "=========================================="); @@ -660,92 +259,457 @@ static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* // 1. 
NVLinks (already the case) // 2. PCI down // 3. PCI up -// 4. QPI (already the case) +// 4. SYS (already the case) ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) { for (int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL)); return ncclSuccess; } -ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { - struct ncclTopoSystem* s; - NCCLCHECK(ncclCalloc(&s, 1)); - nvmlDevice_t* nvmlDevs; - int g = 0; - NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks)); - for (int r=0; rnRanks; r++) { - if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { - // Consider the GPU as outside of our node if we can't see it through NVML. - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); - if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue; - g++; - struct ncclTopoNode* gpuNode; - NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId)); - gpuNode->rank = r; +ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { + int dev; + NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); + + struct ncclTopoNode* net; + NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev)); + const char* str; + NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); + if (str) sscanf(str, "0x%lx", &net->net.asic); + else net->net.asic = dev; + + ncclDebugNoWarn = NCCL_GRAPH; + int mbps; + if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0; + if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 + net->net.width = mbps / 8000.0; + if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0; + if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0; + if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS; + if (ncclCollNet && 
xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0; + ncclDebugNoWarn = 0; + + NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width)); + NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width)); + return ncclSuccess; +} + +ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { + for (int s=0; snSubs; s++) { + struct ncclXmlNode* xmlNet = xmlNic->subs[s]; + if (strcmp(xmlNet->name, "net") != 0) continue; + int index; + NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index)); + if (index == -1) continue; + NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic)); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { + NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap)); + NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank)); + NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev)); + NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport)); + // Do not go any further, nvlinks will be added in a second pass + return ncclSuccess; +} + +struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x030200", GPU }, { "0x030000", GPU }, { "0x038000", GPU }, { "0x020700", NIC }, { "0x020000", NIC }, { NULL, 0 } }; +struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 0 } }; // x100 Mbps per lane +ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) { + const char* str; + + int type; + NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str)); + NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass)); + + int64_t busId; + NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str)); + NCCLCHECK(busIdToInt64(str, &busId)); + + struct ncclTopoNode* node = NULL; + if (type == GPU) { + struct ncclXmlNode* xmlGpu; + 
NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu)); + if (xmlGpu == NULL) return ncclSuccess; + int index; + NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index)); + if (index == -1) return ncclSuccess; + NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId)); + NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node)); + } + if (type == NIC) { + struct ncclXmlNode* xmlNic; + NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic)); + if (xmlNic == NULL) return ncclSuccess; + + // Ignore sub device ID and merge multi-port NICs into one PCI device. + busId &= 0xfffffffffffffff0; + struct ncclTopoNode* nicNode = NULL; + NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId)); + if (nicNode == NULL) { + NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId)); + node = nicNode; // Connect it to parent later on + } + NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode)); + } else if (type == PCI) { + NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId)); + for (int s=0; snSubs; s++) { + struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; + NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node)); } } + if (node) { + int width, speed; + NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width)); + NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str)); + + // Manage cases where speed was not indicated in /sys + if (width == 0) width = 16; + if (strlen(str) == 0 || strcasecmp(str, "Unknown speed") == 0) str = "8 GT/s"; + + NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end) + + NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0)); + NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0)); + } + return ncclSuccess; +} + +struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } }; +struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { 
NULL, 0 } }; + +ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) { + int numaId; + NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId)); + struct ncclTopoNode* cpu; + NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId)); + const char* str; + NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str)); + if (str != NULL) { + NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity)); + } + + NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str)); + NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch)); + if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) { + NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str)); + NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor)); + if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { + int familyId, modelId; + NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); + NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); + cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; + } + } + for (int s=0; snSubs; s++) { + struct ncclXmlNode* node = xmlCpu->subs[s]; + if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu)); + if (strcmp(node->name, "nic") == 0) { + struct ncclTopoNode* nic = NULL; + NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); + if (nic == NULL) { + NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0)); + NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH)); + NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH)); + } + NCCLCHECK(ncclTopoAddNic(node, system, nic)); + } + } + return ncclSuccess; +} + #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - NCCLCHECK(ncclTopoConnectXGMI(comm, s)); -#else - NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s)); -#endif - NCCLCHECK(ncclTopoConnectPCI(s)); - - free(nvmlDevs); - NCCLCHECK(ncclTopoSortSystem(s)); - *system = s; - return ncclSuccess; -} - -ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t 
busId2, int* nvlink) { - int g1, g2; - NCCLCHECK(idToIndex(system, busId1, &g1)); - NCCLCHECK(idToIndex(system, busId2, &g2)); - *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL; - return ncclSuccess; -} - -ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) { - int g; - NCCLCHECK(idToIndex(system, busId, &g)); - for (int i=0; inodes[GPU].count; i++) { - if (i == g) continue; - if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) { - *nvlink = 1; - return ncclSuccess; +ncclResult_t ncclTopoAddXGMI(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) { + if (strcmp(node->name, "xgmi") == 0) { + struct ncclTopoNode* gpu = NULL; + int64_t pBusId; + NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); + NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); + if (gpu == NULL) { + WARN("Add XGMI error : could not find GPU %lx\n", pBusId); + return ncclInternalError; + } + int count; + NCCLCHECK(xmlGetAttrInt(node, "count", &count)); + const char* targetClass; + NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass)); + int targetType; + NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass)); + struct ncclTopoNode* remote = NULL; + if (targetType == GPU) { + // NVL P2P connection to another GPU + const char* target; + NCCLCHECK(xmlGetAttrStr(node, "target", &target)); + int64_t busId; + NCCLCHECK(busIdToInt64(target, &busId)); + NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId)); + } else if (targetType == CPU) { + // NVL connection to the local CPU + NCCLCHECK(findLocalCpu(gpu, &remote)); + } else { + if (system->nodes[NVS].count == 0) { + NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); + } else { + remote = system->nodes[NVS].nodes; + } + } + if (remote) { + int nvlSpeed = VEGA_XGMI_WIDTH; + NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed)); + if (remote->type != GPU) { + 
NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed)); + } + } + } else { + const char* busId; + NCCLCHECK(xmlGetAttr(node, "busid", &busId)); + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoAddXGMI(node->subs[s], system, busId ? busId : parentBusId)); } } - *nvlink = 0; return ncclSuccess; } - -static int pathDistance(struct ncclTopoLinkList* links) { - int distance = PATH_PIX; - if (links->count > 2) distance = PATH_PXB; - for (int l=0; lcount; l++) { - // PHB if we go through 1 CPU, SYS if we go through 2 CPUs - if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB; +#else +ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) { + if (strcmp(node->name, "nvlink") == 0) { + struct ncclTopoNode* gpu = NULL; + int64_t pBusId; + NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); + NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); + if (gpu == NULL) { + WARN("Add NVLink error : could not find GPU %lx\n", pBusId); + return ncclInternalError; + } + int count; + NCCLCHECK(xmlGetAttrInt(node, "count", &count)); + const char* targetClass; + NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass)); + int targetType; + NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass)); + struct ncclTopoNode* remote = NULL; + if (targetType == GPU) { + // NVL P2P connection to another GPU + const char* target; + NCCLCHECK(xmlGetAttrStr(node, "target", &target)); + int64_t busId; + NCCLCHECK(busIdToInt64(target, &busId)); + NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId)); + } else if (targetType == CPU) { + // NVL connection to the local CPU + NCCLCHECK(findLocalCpu(gpu, &remote)); + } else { + if (system->nodes[NVS].count == 0) { + NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); + } else { + remote = system->nodes[NVS].nodes; + } + } + if (remote) { + int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? 
PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH; + NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed)); + if (remote->type != GPU) { + NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed)); + } + } + } else { + const char* busId; + NCCLCHECK(xmlGetAttr(node, "busid", &busId)); + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId)); + } } - return distance; + return ncclSuccess; } +#endif + +ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) { + NCCLCHECK(ncclCalloc(topoSystem, 1)); + struct ncclXmlNode* topNode; + NCCLCHECK(xmlFindTag(xml, "system", &topNode)); + for (int s=0; snSubs; s++) { + struct ncclXmlNode* node = topNode->subs[s]; + if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); + } +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + NCCLCHECK(ncclTopoAddXGMI(topNode, *topoSystem, NULL)); +#else + NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL)); +#endif + + NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); + NCCLCHECK(ncclTopoSortSystem(*topoSystem)); -ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) { - int g1, g2; - NCCLCHECK(idToIndex(system, busId1, &g1)); - NCCLCHECK(idToIndex(system, busId2, &g2)); - *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2); return ncclSuccess; } -ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) { - int g; - NCCLCHECK(idToIndex(system, busId, &g)); - *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev); +NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0); + +// Only set values if not already set +static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if 
(index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); + } + return ncclSuccess; +} +static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if (index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value); + } return ncclSuccess; } -ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) { - *count = system->nodes[CPU].count; + +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { + struct ncclXml* xml; + NCCLCHECK(ncclCalloc(&xml, 1)); + char* xmlTopoFile = getenv("NCCL_TOPO_FILE"); + if (xmlTopoFile) { + NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml)); + } + if (xml->maxIndex == 0) { + // Create top tag + struct ncclXmlNode* top; + NCCLCHECK(xmlAddNode(xml, NULL, "system", &top)); + NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION)); + } + + // Auto-detect GPUs if needed + for (int r=0; rnRanks; r++) { + if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); + struct ncclXmlNode* node; + NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); + NCCLCHECK(xmlSetAttrInt(node, "rank", r)); + NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); + } + } + // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, + // so we start with collnet so that it has precedence. 
+ int netDevCount = 0; + if (ncclCollNet) { + NCCLCHECK(collNetDevices(&netDevCount)); + for (int n=0; nrank == ncclParamTopoDumpFileRank()) { + NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml)); + } + + NCCLCHECK(ncclTopoGetSystemFromXml(xml, system)); + free(xml); + return ncclSuccess; +} + +/****************************/ +/* External query functions */ +/****************************/ + +ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) { + *arch = system->nodes[CPU].nodes[0].cpu.arch; + *vendor = system->nodes[CPU].nodes[0].cpu.vendor; + *model = system->nodes[CPU].nodes[0].cpu.model; + return ncclSuccess; +} + +NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); + +ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) { + struct ncclTopoNode* cpu = NULL, *gpu = NULL; + for (int g=0; gnodes[GPU].count; g++) { + if (system->nodes[GPU].nodes[g].gpu.rank == rank) { + gpu = system->nodes[GPU].nodes+g; + // Find closer CPU + int cpuIndex = -1, minHops = 0; + for (int c=0; cnodes[CPU].count; c++) { + int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count; + if (cpuIndex == -1 || nHops < minHops) { + cpuIndex = c; + minHops = nHops; + } + } + cpu = system->nodes[CPU].nodes+cpuIndex; + } + } + if (cpu == NULL) { + WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank); + return ncclInternalError; + } + + // Query the CPU affinity set we were provided + cpu_set_t mask; + SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); + +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); + TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); + } +#endif + + // Get the affinity of the CPU close to our GPU. 
+ cpu_set_t cpuMask = cpu->cpu.affinity; + +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr)); + TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); + } +#endif + + cpu_set_t finalMask; + if (ncclParamIgnoreCpuAffinity()) + // Ignore the CPU affinity set and use the GPU one instead + finalMask = cpuMask; + else + // Use a subset of the GPU affinity set + CPU_AND(&finalMask, &mask, &cpuMask); + + // If there is a non empty set, use it to set affinity + if (CPU_COUNT(&finalMask)) { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); + INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr); + SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity"); + } return ncclSuccess; } diff --git a/src/graph/topo.h b/src/graph/topo.h index f9e9ec5a7e..f07e6a1310 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -10,27 +10,28 @@ #include "graph.h" #include "core.h" +#include -#define LOC_WIDTH 5000 -#define PASCAL_NVLINK_WIDTH 18 -#define VOLTA_NVLINK_WIDTH 21 -#define PCI_WIDTH 12 // PCI Gen3 x16 -#define QPI_WIDTH 8 -#define SKL_QPI_WIDTH 12 -#define SKL_PCI_WIDTH 12 -#define SKL_CPUPCI_WIDTH 12 -#define P9_WIDTH 32 -#define NET_WIDTH 12 // 100Gbit -#define ROME_QPI_WIDTH 18 -#define ROME_PCI_WIDTH 18 -#define ROME_CPUPCI_WIDTH 18 +#define LOC_WIDTH 5000.0 +#define PASCAL_NVLINK_WIDTH 18.0 +#define VOLTA_NVLINK_WIDTH 21.0 +#define PCI_WIDTH 12.0 // PCI Gen3 x16 +#define QPI_WIDTH 6.0 +#define SKL_QPI_WIDTH 9.0 +#define P9_WIDTH 32.0 +#define ARM_WIDTH 6.0 +#define NET_WIDTH 12.0 // 100Gbit +#define VEGA_XGMI_WIDTH 20.0 +#define ROME_QPI_WIDTH 18.0 +#define ROME_PCI_WIDTH 18.0 +#define ROME_CPUPCI_WIDTH 18.0 -// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, to GPU -// to GPU traffic consumed more PCI bandwidth. +// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU +// to GPU traffic consumes more PCI bandwidth. 
#define INTEL_P2P(speed) (speed*9/12) #define INTEL_P2P_OVERHEAD(speed) (speed*12/9) -#define NCCL_TOPO_NODE_TYPES 6 +#define NCCL_TOPO_NODE_TYPES 7 #define GPU 0 #define PCI 1 #define NVS 2 @@ -39,37 +40,72 @@ #define NET 5 extern const char* topoNodeTypeStr[]; +// We want link types and path types to match as much as possible #define LINK_LOC 0 #define LINK_NVL 1 #define LINK_PCI 2 -#define LINK_QPI 3 -#define LINK_NET 4 +// Skipping 3 for PATH_PXB +// Skipping 4 for PATH_PHB +#define LINK_SYS 5 +#define LINK_NET 6 extern const char* topoLinkTypeStr[]; +#define PATH_LOC 0 +#define PATH_NVL 1 +#define PATH_PIX 2 +#define PATH_PXB 3 +#define PATH_PHB 4 +#define PATH_SYS 5 +#define PATH_NET 6 +extern const char* topoPathTypeStr[]; + struct ncclTopoNode; struct ncclTopoLink { int type; - int width; + float width; struct ncclTopoNode* remNode; }; #define NCCL_TOPO_MAX_LINKS 32 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) -#define SELECT_PATH 1 -#define SELECT_LAST 2 - -#define NET_GDR_MASK 0x70000000 struct ncclTopoLinkList { struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; int count; - int width; + float width; int type; }; +#define NCCL_TOPO_CPU_INTEL_BDW 1 +#define NCCL_TOPO_CPU_INTEL_SKL 2 + +#define NCCL_TOPO_UNDEF (-1) + struct ncclTopoNode { int type; int64_t id; - int rank; + // Type specific data + union { + struct { + int dev; // NVML dev number + int rank; + int cudaCompCap; + int gdrSupport; + }gpu; + struct { + uint64_t asic; + int port; + float width; + int gdrSupport; + int collSupport; + int maxChannels; + }net; + struct { + int arch; + int vendor; + int model; + cpu_set_t affinity; + }cpu; + }; int nlinks; struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // Pre-computed paths to GPUs and NICs @@ -85,60 +121,29 @@ struct ncclTopoNodeSet { struct ncclTopoSystem { struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; - int maxSpeed; - int maxWidth; - int searchInitDone; + float maxWidth; }; -static ncclResult_t 
ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { +ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); +ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); +ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id); +ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width); +ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); +ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); + +ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem); +ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph); +ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml); + +static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) { + *index = -1; for (int i=0; inodes[type].count; i++) { if (system->nodes[type].nodes[i].id == id) { - *node = system->nodes[type].nodes+i; + *index = i; return ncclSuccess; } } - if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { - WARN("Error : tried to create too many nodes of type %d\n", type); - return ncclInternalError; - } - struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; - system->nodes[type].count++; - n->type = type; - n->id = id; - if (type == GPU) { - // Create link to itself (used in some corner cases) - n->nlinks=1; - n->links[0].type = LINK_LOC; - n->links[0].remNode = n; - n->links[0].width = LOC_WIDTH; - } - *node = n; - return ncclSuccess; + return ncclInternalError; } -static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, 
int width) { - // Aggregate links into higher width for NVLink - struct ncclTopoLink* link; - for (link = node->links; link->remNode; link++) { - if (link->remNode == remNode && link->type == type) break; - } - if (link->remNode == NULL) node->nlinks++; - link->type = type; - link->remNode = remNode; - link->width += width; - - // Sort links in BW descending order - struct ncclTopoLink linkSave; - memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); - while (link != node->links) { - if ((link-1)->width >= linkSave.width) break; - memcpy(link, link-1, sizeof(struct ncclTopoLink)); - link--; - } - memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); - return ncclSuccess; -} - -ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); - #endif diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 957c63e973..14f2ed4ec1 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -53,12 +53,12 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li } static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; -static const char* ncclAlgoStr[] = { "Tree", "Ring" }; +static const char* ncclAlgoStr[] = { "Tree", "Ring", "CollNet" }; static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" }; // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 37.9, 37.9, 40.4 }, { 20.5, 20.5, 27.9 } }; +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 37.9, 37.9, 40.4 }, { 20.5, 20.5, 27.9 }, { 37.9, 37.9, 40.4 } }; // NVLink, PCI, Network #define NCCL_HW_NVLINK 0 @@ -67,29 +67,32 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 37.9 // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). 
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 }, /* Ring (LL/LL128/Simple)*/ { 2.3, 2.3, 2.7 } }, + { /* Tree (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 }, /* Ring (LL/LL128/Simple)*/ { 2.3, 2.3, 2.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 4.5 } } + { /* Tree (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 } } }; // LL128 max BW for the different collectives static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 }; -ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) { - int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 
256 : NCCL_MAX_NTHREADS; - comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads); - comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS); - comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); - - INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]); +ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { + int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS; + comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = + getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads); + comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = + getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS); + comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] = + getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS); + comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] = + getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); if (comm->nRanks <= 1) return ncclSuccess; - struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph 
}; - int intraHw[2], hw[2]; - for (int a=0; anvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI; + struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph }; + int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; + for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; anNodes == 1 ? intraHw[a] : NCCL_HW_NET; for (int coll=0; collnRanks; for (int a=0; anNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter; - float busBw = graphs[a]->nChannels * speed * 0.6; + float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter; + float busBw = graphs[a]->nChannels * speed; // Various model refinements if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0; @@ -110,9 +113,12 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3; if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0; + if (a == NCCL_ALGO_COLLNET) busBw *= .9; + if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides + if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128 // Convert bus BW to algorithm BW - float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps; + float ratio = (a != NCCL_ALGO_RING) ? 
.5 : (1.0 * comm->nRanks) / nsteps; comm->bandwidths[coll][a][p] = busBw * ratio; comm->latencies[coll][a][p] = baseLat[a][p]; @@ -128,11 +134,16 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom } else { comm->latencies[coll][a][p] += nsteps*lat; } - } else { + } else if (a == NCCL_ALGO_TREE) { float intraLat = hwLat[intraHw[a]][a][p]; float interLat = hwLat[NCCL_HW_NET][a][p]; comm->latencies[coll][a][p] += 2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat); + } else { + float intraLat = hwLat[intraHw[a]][a][p]; + float interLat = hwLat[NCCL_HW_NET][a][p]; + comm->latencies[coll][a][p] += + 2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat; } } } @@ -141,7 +152,7 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; - int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 }; const char *protoStr = getenv("NCCL_PROTO"); if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); @@ -152,30 +163,32 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom int pEnable = protoEnable[p]; if (pEnable == 2 && p == NCCL_PROTO_LL128) { // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption. - pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0; + pEnable = (graphs[a]->typeInter <= LINK_PCI) && graphs[a]->typeIntra == LINK_NVL && minCompCap == 70 && maxCompCap == 70 ? 
1 : 0; } if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; } if (comm->rank == 0) { char line[1024]; - int offset = 0; sprintf(line, "Latency/AlgBw |"); - offset = strlen(line); for (int a=0; amaxThreads[a][p]); } } INFO(NCCL_TUNING, "%s", line); for (int c=0; clatencies[c][a][p], comm->bandwidths[c][a][p]); - offset = strlen(line); + sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]); } } INFO(NCCL_TUNING, "%s", line); @@ -202,12 +215,41 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom } } - INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld", + INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld/%ld/%ld", comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128], - comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], + comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL], + comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128], + comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE]); + return ncclSuccess; +} + +// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction +// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB. 
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { + { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, + { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, + { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .41, .27, .25, .39, .46, .72, .76, .87, .92, .97, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 } +}; + +static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { + { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, + { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, + { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .04, .08, .09, .09, .11, .13, .25, .40, .59, .76, .86, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 } +}; + +ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) { + float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; + if (bw == 0) { + *time = -1.0; return ncclSuccess; + } + int logSize = log2i(info->nBytes>>6); + if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize]; + else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize]; + *time = info->comm->latencies[info->coll][algorithm][protocol] + (info->nBytes) / (1000 * bw); return ncclSuccess; } diff --git a/src/graph/xml.cc b/src/graph/xml.cc new file mode 100644 index 0000000000..a2b3a77d38 --- /dev/null +++ b/src/graph/xml.cc @@ -0,0 +1,819 @@ +/************************************************************************* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include +#include +#include "core.h" +#include "nvmlwrap.h" +#include "xml.h" + +/*******************/ +/* XML File Parser */ +/*******************/ + +ncclResult_t xmlGetChar(FILE* file, char* c) { + if (fread(c, 1, 1, file) == 0) { + WARN("XML Parse : Unexpected EOF"); + return ncclInternalError; + } + return ncclSuccess; +} + +ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { + char c; + NCCLCHECK(xmlGetChar(file, &c)); + if (c != '"' && c != '\'') { +#if INT_OK + int o = 0; + do { + value[o++] = c; + NCCLCHECK(xmlGetChar(file, &c)); + } while (c >= '0' && c <= '9'); + value[o] = '\0'; + *last = c; + return ncclSuccess; +#else + WARN("XML Parse : Expected (double) quote."); + return ncclInternalError; +#endif + } + int o = 0; + do { + NCCLCHECK(xmlGetChar(file, &c)); + value[o++] = c; + } while (c != '"'); + value[o-1] = '\0'; + NCCLCHECK(xmlGetChar(file, last)); + return ncclSuccess; +} + +ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) { + char c; + char* ptr = name; + int o = 0; + do { + NCCLCHECK(xmlGetChar(file, &c)); + if (c == '=') { + ptr[o] = '\0'; + if (value == NULL) { + WARN("XML Parse : Unexpected value with name %s\n", ptr); + return ncclInternalError; + } + return xmlGetValue(file, value, last); + } + ptr[o] = c; + if (o == MAX_STR_LEN-1) { + ptr[o] = '\0'; + WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN); + return ncclInternalError; + } + o++; + } while (c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r'); + ptr[o-1] = '\0'; + *last = c; + return ncclSuccess; +} + +// Shift the 3-chars string by one char and append c at the end +#define SHIFT_APPEND(s, c) do { s[0]=s[1]; s[1]=s[2]; s[2]=c; } while(0) +ncclResult_t xmlSkipComment(FILE* file, char* start, char next) { + // Start from something neutral with \0 at the end. 
+ char end[4] = "..."; + + // Inject all trailing chars from previous reads. We don't need + // to check for --> here because there cannot be a > in the name. + for (int i=0; i" + while (strcmp(end, "-->") != 0) { + int c; + if (fread(&c, 1, 1, file) != 1) { + WARN("XML Parse error : unterminated comment"); + return ncclInternalError; + } + SHIFT_APPEND(end, c); + } + return ncclSuccess; +} + +ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) { + node->type = NODE_TYPE_NONE; + char c = ' '; + while (c == ' ' || c == '\n' || c == '\r') { + if (fread(&c, 1, 1, file) == 0) return ncclSuccess; + } + if (c != '<') { + WARN("XML Parse error : expecting '<', got '%c'", c); + return ncclInternalError; + } + // Read XML element name + NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); + + // Check for comments + if (strncmp(node->name, "!--", 3) == 0) { + NCCLCHECK(xmlSkipComment(file, node->name+3, c)); + return xmlGetNode(file, node); + } + + // Check for closing tag + if (node->name[0] == '\0' && c == '/') { + node->type = NODE_TYPE_CLOSE; + // Re-read the name, we got '/' in the first call + NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); + if (c != '>') { + WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name); + return ncclInternalError; + } + return ncclSuccess; + } + + node->type = NODE_TYPE_OPEN; + + // Get Attributes + int a = 0; + while (c == ' ') { + NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c)); + if (a == MAX_ATTR_COUNT) { + INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT); + // Actually we need to still consume the extra attributes so we have an extra one. 
+ } else a++; + } + node->nAttrs = a; + if (c == '/') { + node->type = NODE_TYPE_SINGLE; + char str[MAX_STR_LEN]; + NCCLCHECK(xmlGetToken(file, str, NULL, &c)); + } + if (c != '>') { + WARN("XML Parse : expected >, got '%c'", c); + return ncclInternalError; + } + return ncclSuccess; +} + +typedef ncclResult_t (*xmlHandlerFunc_t)(FILE*, struct ncclXml*, struct ncclXmlNode*); + +struct xmlHandler { + const char * name; + xmlHandlerFunc_t func; +}; + +ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) { + if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess; + while (1) { + if (xml->maxIndex == MAX_NODES) { + WARN("Error : XML parser is limited to 1024 nodes\n"); + return ncclInternalError; + } + struct ncclXmlNode* node = xml->nodes+xml->maxIndex; + memset(node, 0, sizeof(struct ncclXmlNode)); + NCCLCHECK(xmlGetNode(file, node)); + if (node->type == NODE_TYPE_NONE) { + if (head) { + WARN("XML Parse : unterminated %s", head->name); + return ncclInternalError; + } else { + // All done + return ncclSuccess; + } + } + if (head && node->type == NODE_TYPE_CLOSE) { + if (strcmp(node->name, head->name) != 0) { + WARN("XML Mismatch : %s / %s", head->name, node->name); + return ncclInternalError; + } + return ncclSuccess; + } + int found = 0; + for (int h=0; hname, handlers[h].name) == 0) { + if (head) head->subs[head->nSubs++] = node; + node->parent = head; + node->nSubs = 0; + xml->maxIndex++; + NCCLCHECK(handlers[h].func(file, xml, node)); + found = 1; + break; + } + } + if (!found) { + if (nHandlers) INFO(NCCL_GRAPH, "Ignoring element %s", node->name); + NCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0)); + } + } +} + +/**************/ +/* XML Writer */ +/**************/ + +ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) { + for (int i=0; iname); + + for (int a=0; anAttrs; a++) { + fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value); + } + 
if (node->nSubs == 0) { + fprintf(file, "/>\n"); + } else { + fprintf(file, ">\n"); + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[s])); + } + for (int i=0; i\n", node->name); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) { + FILE* file = fopen(xmlTopoFile, "w"); + if (file == NULL) { + WARN("Unable to open %s, not dumping topology.", xmlTopoFile); + return ncclSuccess; + } + NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes)); + fclose(file); + return ncclSuccess; +} + +/****************************************/ +/* Parser rules for our specific format */ +/****************************************/ + +ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + struct xmlHandler handlers[] = { { "xgmi", ncclTopoXmlLoadNvlink } }; +#else + struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } }; +#endif + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "net", ncclTopoXmlLoadNet } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3)); 
+ return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "nic", ncclTopoXmlLoadNic } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + int version; + NCCLCHECK(xmlGetAttrInt(head, "version", &version)); + if (version != NCCL_TOPO_XML_VERSION) { + WARN("XML Topology has wrong version %d, %d needed", version, NCCL_TOPO_XML_VERSION); + return ncclInvalidUsage; + } + const char* name; + NCCLCHECK(xmlGetAttr(head, "name", &name)); + if (name != NULL) INFO(NCCL_GRAPH, "Loading topology %s", name); + else INFO(NCCL_GRAPH, "Loading unnamed topology"); + + struct xmlHandler handlers[] = { { "cpu", ncclTopoXmlLoadCpu } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml) { + FILE* file = fopen(xmlTopoFile, "r"); + if (file == NULL) { + WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); + return ncclSuccess; + } + struct xmlHandler handlers[] = { { "system", ncclTopoXmlLoadSystem } }; + xml->maxIndex = 0; + NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); + fclose(file); + return ncclSuccess; +} + +/**********************/ +/* XML creation */ +/* from autodetection */ +/**********************/ + +#define BUSID_SIZE (sizeof("0000:00:00.0")) +#define BUSID_REDUCED_SIZE (sizeof("0000:00")) +static void memcpylower(char* dst, const char* src, const size_t size) { + for (int i=0; i %s=%s\n", path, fileName, attrName, strValue); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { + int index; + NCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index)); + if (index == -1) { + const char* numaId; + 
NCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId)); + if (numaId == NULL) { + WARN("GetXmlFromCpu : could not find CPU numa ID."); + return ncclInternalError; + } + // Set affinity + char cpumaskPath[] = "/sys/devices/system/node/node0000"; + sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId); + NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity")); + } + + NCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index)); + if (index == -1) { + // Fill CPU type / vendor / model +#if defined(__PPC__) + NCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64")); +#elif defined(__aarch64__) + NCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64")); +#elif defined(__x86_64__) + NCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64")); +#endif + } + +#if defined(__x86_64__) + NCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index)); + if (index == -1) { + union { + struct { + // CPUID 0 String register order + uint32_t ebx; + uint32_t edx; + uint32_t ecx; + }; + char vendor[12]; + } cpuid0; + + asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0) : "memory"); + char vendor[13]; + strncpy(vendor, cpuid0.vendor, 12); + vendor[12] = '\0'; + NCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor)); + } + + NCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index)); + if (index == -1) { + union { + struct { + unsigned steppingId:4; + unsigned modelId:4; + unsigned familyId:4; + unsigned processorType:2; + unsigned resv0:2; + unsigned extModelId:4; + unsigned extFamilyId:8; + unsigned resv1:4; + }; + uint32_t val; + } cpuid1; + asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1) : "memory"); + int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4); + int modelId = cpuid1.modelId + (cpuid1.extModelId << 4); + NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId)); + NCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId)); + } +#endif + return ncclSuccess; +} + +ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** 
pciNode) { + NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId)); + if (*pciNode == NULL) { + NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode)); + } + NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId)); + return ncclSuccess; +} + +// Check whether a string is in BDF format or not. +// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits. +// There can be trailing chars. +int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); } +int checkBDFFormat(char* bdf) { + if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0; + if (isHex(bdf[0]) == 0 || isHex(bdf[1] == 0) || isHex(bdf[2] == 0) || isHex(bdf[3] == 0) || + isHex(bdf[5] == 0) || isHex(bdf[6] == 0) || isHex(bdf[8] == 0) || isHex(bdf[9] == 0) || + isHex(bdf[11] == 0)) return 0; + return 1; +} + +ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) { + // Fill info, then parent + const char* busId; + NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); + char* path = NULL; + int index; + NCCLCHECK(xmlGetAttrIndex(pciNode, "class", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + char deviceSpeedStr[MAX_STR_LEN]; + float deviceSpeed; + NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr)); + sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed); + char portSpeedStr[MAX_STR_LEN]; + float portSpeed; + NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr)); + sscanf(portSpeedStr, "%f GT/s", &portSpeed); + NCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? 
portSpeedStr : deviceSpeedStr)); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + char strValue[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue)); + int deviceWidth = strtol(strValue, NULL, 0); + NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue)); + int portWidth = strtol(strValue, NULL, 0); + NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth))); + } + struct ncclXmlNode* parent = pciNode->parent; + if (parent == NULL) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + + // Save that for later in case next step is a CPU + char numaIdStr[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr)); + + // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI + // switch, or stop if we reach a CPU root complex. + int slashCount = 0; + int parentOffset; + for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) { + if (path[parentOffset] == '/') { + slashCount++; + path[parentOffset] = '\0'; + int start = parentOffset - 1; + while (start>0 && path[start] != '/') start--; + // Check whether the parent path looks like "BBBB:BB:DD.F" or not. + if (checkBDFFormat(path+start+1) == 0) { + // This a CPU root complex. Create a CPU tag and stop there. 
+ struct ncclXmlNode* topNode; + NCCLCHECK(xmlFindTag(xml, "system", &topNode)); + NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr)); + if (parent == NULL) { + NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent)); + NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr)); + } + } else if (slashCount == 2) { + // Continue on the upper PCI switch + for (int i = strlen(path)-1; i>0; i--) { + if (path[i] == '/') { + NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1)); + if (parent == NULL) { + NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent)); + NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1)); + } + break; + } + } + } + } + if (parent) break; + } + pciNode->parent = parent; + parent->subs[parent->nSubs++] = pciNode; + } + if (strcmp(parent->name, "pci") == 0) { + NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); + } else if (strcmp(parent->name, "cpu") == 0) { + NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml)); + } + free(path); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) { + struct ncclXmlNode* gpuNode = NULL; + NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode)); + if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode)); + + int index = -1; + + int dev = -1; + NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index)); + if (index == -1) { + if (nvmlDev == NULL) { + //WARN("No NVML, trying to use CUDA instead"); + const char* busId; + NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); + if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1; + } else { + NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); + } + NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev)); + } + NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev)); + if (dev == -1) return ncclSuccess; + + NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index)); + if (index == -1) { + int cudaMajor, cudaMinor; + if (nvmlDev == NULL) { + 
hipDeviceProp_t devProp; + CUDACHECK(hipGetDeviceProperties(&devProp, dev)); + cudaMajor = devProp.major; cudaMinor = devProp.minor; + } else { + NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); + } + NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor)); + } + int sm; + NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm)); + + struct ncclXmlNode* nvlNode = NULL; + NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode)); + if (nvlNode == NULL) { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + const char* busId; + NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); + if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) return ncclInternalError; + int deviceCnt; + CUDACHECK(hipGetDeviceCount(&deviceCnt)); + for (int i=0; i 0 && nvmlDev == NULL) { + WARN("No NVML device handle. Skipping nvlink detection.\n"); + maxNvLinks = 0; + } + + for (int l=0; lnSubs; s++) { + struct ncclXmlNode* sub = gpuNode->subs[s]; +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + if (strcmp(sub->name, "xgmi") != 0) continue; +#else + if (strcmp(sub->name, "nvlink") != 0) continue; +#endif + int index; + NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index)); + if (index == -1) { + const char* busId; + NCCLCHECK(xmlGetAttr(sub, "target", &busId)); + char* path; + NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); + } + } + *gpuNodeRet = gpuNode; + return ncclSuccess; +} + +ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) { + struct ncclXmlNode* node; + NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node)); + NCCLCHECK(ncclTopoGetXmlFromSys(node, xml)); + NCCLCHECK(wrapNvmlSymbols()); + NCCLCHECK(wrapNvmlInit()); + nvmlDevice_t nvmlDev; + if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL; + NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode)); + return 
ncclSuccess; +} + +// Returns the subsystem name of a path, i.e. the end of the path +// where sysPath/subsystem points to. +ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { + char subSysPath[PATH_MAX]; + sprintf(subSysPath, "%s/subsystem", sysPath); + char* path = realpath(subSysPath, NULL); + if (path == NULL) { + subSys[0] = '\0'; + } else { + int offset; + for (offset = strlen(path); offset > 0 && path[offset] != '/'; offset--); + strcpy(subSys, path+offset+1); + free(path); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) { + NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName)); + if (*netNode != NULL) return ncclSuccess; + + const char* pciSysPath = pciPath; + if (pciSysPath) { + char subSystem[PATH_MAX]; + NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); + // This is not a PCI device (virtual, usb, ...). + if (strcmp(subSystem, "pci") != 0) { + INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). 
Attaching to first CPU", pciSysPath, subSystem); + pciSysPath = NULL; + } + } + + struct ncclXmlNode* parent = NULL; + if (pciSysPath) { + int offset; + for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + strcpy(busId, pciSysPath+offset+1); + NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId)); + if (parent == NULL) { + NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent)); + NCCLCHECK(xmlSetAttr(parent, "busid", busId)); + NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); + } + } else { + // Virtual NIC, no PCI device, attach to first CPU + NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); + } + + struct ncclXmlNode* nicNode = NULL; + NCCLCHECK(xmlGetSub(parent, "nic", &nicNode)); + if (nicNode == NULL) { + NCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode)); + } + + // We know that this net does not exist yet (we searched for it at the + // beginning of this function), so we can add it. + NCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode)); + NCCLCHECK(xmlSetAttr(*netNode, "name", netName)); + return ncclSuccess; +} + +/**************************************************/ +/* Parser rules for the user-defined graph search */ +/**************************************************/ + +ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "net", ncclTopoXmlGraphLoadNet }, { "gpu", ncclTopoXmlGraphLoadGpu } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, struct 
ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "channel", ncclTopoXmlGraphLoadChannel } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) { + int version; + NCCLCHECK(xmlGetAttrInt(head, "version", &version)); + if (version != NCCL_GRAPH_XML_VERSION) { + WARN("XML Graph has wrong version %d, %d needed", version, NCCL_GRAPH_XML_VERSION); + return ncclInvalidUsage; + } + const char* name; + NCCLCHECK(xmlGetAttr(head, "name", &name)); + if (name != NULL) INFO(NCCL_GRAPH, "Loading graphs for topology %s", name); + else INFO(NCCL_GRAPH, "Loading graphs"); + + struct xmlHandler handlers[] = { { "graph", ncclTopoXmlGraphLoadGraph } }; + NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml) { + FILE* file = fopen(xmlGraphFile, "r"); + if (file == NULL) { + WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno)); + return ncclSystemError; + } + struct xmlHandler handlers[] = { { "graphs", ncclTopoXmlGraphLoadGraphs } }; + xml->maxIndex = 0; + NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); + fclose(file); + return ncclSuccess; +} diff --git a/src/graph/xml.h b/src/graph/xml.h new file mode 100644 index 0000000000..fa04527803 --- /dev/null +++ b/src/graph/xml.h @@ -0,0 +1,237 @@ +/************************************************************************* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef XML_H_ +#define XML_H_ + +// A few constraints to make the implementation easy +#define MAX_STR_LEN 256 +#define MAX_ATTR_COUNT 16 +#define MAX_SUBS 32 +#define MAX_NODES 1024 + +#define NODE_TYPE_NONE 0 +#define NODE_TYPE_OPEN 1 +#define NODE_TYPE_CLOSE 2 +#define NODE_TYPE_SINGLE 3 + +struct ncclXmlNode { + char name[MAX_STR_LEN]; + struct { + char key[MAX_STR_LEN]; + char value[MAX_STR_LEN]; + } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params + int nAttrs; + int type; + struct ncclXmlNode* parent; + struct ncclXmlNode* subs[MAX_SUBS]; + int nSubs; +}; + +struct ncclXml { + struct ncclXmlNode nodes[MAX_NODES]; + int maxIndex; +}; + +/* File functions */ +#define NCCL_TOPO_XML_VERSION 1 +ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml); +ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml); +#define NCCL_GRAPH_XML_VERSION 1 +ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml); + +/* Auto-detect functions */ +ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode); +ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode); + +/**************/ +/* XML Struct */ +/* Functions */ +/**************/ + +static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) { + *index = -1; + const int nAttrs = node->nAttrs; + for (int a=0; aattrs[a].key, attrName, MAX_STR_LEN-1) == 0) { + *index = a; + return ncclSuccess; + } + } + return ncclSuccess; +} + +static ncclResult_t xmlGetAttr(struct ncclXmlNode* node, const char* attrName, const char** value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + *value = index == -1 ? 
NULL : node->attrs[index].value; + return ncclSuccess; +} + +static ncclResult_t xmlGetAttrStr(struct ncclXmlNode* node, const char* attrName, const char** value) { + NCCLCHECK(xmlGetAttr(node, attrName, value)); + if (*value == NULL) { + WARN("Attribute %s of node %s not found", attrName, node->name); + return ncclInternalError; + } + return ncclSuccess; +} +static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName, int* value) { + const char* str; + NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); + *value = strtol(str, NULL, 0); + return ncclSuccess; +} + +static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) { + const char* str; + NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); + *value = strtof(str, NULL); + return ncclSuccess; +} + +static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) { + *node = NULL; + for (int i=0; imaxIndex; i++) { + struct ncclXmlNode* n = xml->nodes+i; + if (strcmp(n->name, tagName) == 0) { + *node = n; + return ncclSuccess; + } + } + return ncclSuccess; +} + +static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) { + *node = NULL; + for (int i=0; imaxIndex; i++) { + struct ncclXmlNode* n = xml->nodes+i; + if (strcmp(n->name, tagName) == 0) { + const char* value; + NCCLCHECK(xmlGetAttr(n, attrName, &value)); + if (value && strcmp(value, attrValue) == 0) { + *node = n; + return ncclSuccess; + } + } + } + return ncclSuccess; +} + +static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if (index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + } + strncpy(node->attrs[index].value, value, MAX_STR_LEN); + return ncclSuccess; +} + +static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* 
node, const char* attrName, const int value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if (index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + } + snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); + return ncclSuccess; +} + +static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if (index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + } + snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value); + return ncclSuccess; +} + +static ncclResult_t xmlGetSub(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub) { + *sub = NULL; + for (int s=0; snSubs; s++) { + if (strcmp(node->subs[s]->name, subName) == 0) { + *sub = node->subs[s]; + return ncclSuccess; + } + } + return ncclSuccess; +} + +static ncclResult_t xmlGetSubKv(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const char* attrValue) { + *sub = NULL; + for (int s=0; snSubs; s++) { + struct ncclXmlNode* subNode = node->subs[s]; + if (strcmp(subNode->name, subName) == 0) { + const char* value; + NCCLCHECK(xmlGetAttr(subNode, attrName, &value)); + if (value && strcmp(value, attrValue) == 0) { + *sub = node->subs[s]; + return ncclSuccess; + } + } + } + return ncclSuccess; +} +static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const int attrValue) { + char strValue[10]; + snprintf(strValue, 10, "%d", attrValue); + NCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue)); + return ncclSuccess; +} + +static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) { + if (xml->maxIndex == MAX_NODES) { + WARN("Error : too many XML nodes (max %d)", 
MAX_NODES); + return ncclInternalError; + } + struct ncclXmlNode* s = xml->nodes+xml->maxIndex++; + s->nSubs = 0; + s->nAttrs = 0; + *sub = s; + s->parent = parent; + if (parent) parent->subs[parent->nSubs++] = s; + strncpy(s->name, subName, MAX_STR_LEN); + return ncclSuccess; +} + +// Dictionary for STR -> INT conversions. No dictionary size information, +// there needs to be a last element with str == NULL. +struct kvDict { + const char* str; + int value; +}; + +static ncclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) { + struct kvDict* d = dict; + while (d->str) { + if (strncmp(str, d->str, strlen(d->str)) == 0) { + *value = d->value; + return ncclSuccess; + } + d++; + } + WARN("KV Convert to int : could not find value of '%s' in dictionary", str); + return ncclInternalError; +} +static ncclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) { + struct kvDict* d = dict; + while (d->str) { + if (value == d->value) { + *str = d->str; + return ncclSuccess; + } + d++; + } + WARN("KV Convert to str : could not find value %d in dictionary", value); + return ncclInternalError; +} + +#endif diff --git a/src/include/align.h b/src/include/align.h new file mode 100644 index 0000000000..1c9e7aa920 --- /dev/null +++ b/src/include/align.h @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALIGN_H_ +#define NCCL_ALIGN_H_ + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) + +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +#endif diff --git a/src/include/alloc.h b/src/include/alloc.h index 04a016aaba..8701dead1e 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "checks.h" +#include "align.h" #include static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { @@ -61,4 +62,19 @@ static bool hasFineGrainVramPcie() { else return false; } + +// Allocate memory to be potentially ibv_reg_mr'd. This needs to be +// allocated on separate pages as those pages will be marked DONTFORK +// and if they are shared, that could cause a crash in a child process +static ncclResult_t ncclIbMalloc(void** ptr, size_t size) { + size_t page_size = sysconf(_SC_PAGESIZE); + void* p; + int size_aligned = ROUNDUP(size, page_size); + int ret = posix_memalign(&p, page_size, size_aligned); + if (ret != 0) return ncclSystemError; + memset(p, 0, size); + *ptr = p; + return ncclSuccess; +} + #endif diff --git a/src/include/checks.h b/src/include/checks.h index 0bc6e4ede2..85144a6986 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -57,7 +57,7 @@ ncclResult_t res = call; \ if (res != ncclSuccess) { \ /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ return res; \ } \ } while (0); @@ -66,7 +66,7 @@ res = call; \ if (res != ncclSuccess) { \ /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ goto label; \ } \ } while (0); diff --git 
a/src/include/coll_net.h b/src/include/coll_net.h new file mode 100644 index 0000000000..3278560d9a --- /dev/null +++ b/src/include/coll_net.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COLL_NET_H_ +#define COLL_NET_H_ + +#include "nccl.h" +#include "nccl_net.h" + +extern ncclCollNet_t* ncclCollNet; +typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; + +// Translation to external API +static const char* collNetName() { return ncclCollNet->name; } +static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; } +static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } +static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } +static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } +static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; } +static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, 
void* sendMhandle, void* recvMhandle, void** request) { + NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } +static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; } +static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; } +static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; } + +static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; } + +#endif diff --git a/src/include/collectives.h b/src/include/collectives.h index e7baba3f66..26da628ae0 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -1,4 +1,3 @@ -#include "hip/hip_runtime.h" /************************************************************************* * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
@@ -32,7 +31,8 @@ #define DECL_COLL3(coll, op, dtype) \ DECL_COLL4(coll##Ring, op, dtype) \ - DECL_COLL4(coll##Tree, op, dtype) + DECL_COLL4(coll##Tree, op, dtype) \ + DECL_COLL4(coll##CollNet, op, dtype) #define DECL_COLL2(coll, op) \ DECL_COLL3(coll, op, i8) \ diff --git a/src/include/comm.h b/src/include/comm.h index 71548aec73..4be958f0d9 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -24,8 +24,6 @@ struct cudaLaunchParams { #endif #endif -#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ - #define CACHE_LINE_SIZE 64 #define MEM_ALIGN 4096 #define CUDA_IPC_MIN 2097152UL @@ -95,14 +93,11 @@ struct ncclComm { // Channels for collectives int nChannels; - // Only nvlink is used for inter-GPU communication - int nvlink; - // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - int maxThreads[NCCL_NUM_PROTOCOLS]; + int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; // An internal CUDA stream for NCCL kernel CGMD launches int groupCudaStream; @@ -140,6 +135,9 @@ struct ncclComm { // Global proxy thread pthread_t proxyThread; struct ncclProxyState proxyState; + + // Whether this communicator uses collNet + int collNetSupport; }; #endif diff --git a/src/include/core.h b/src/include/core.h index a98e5090ba..908e2de2f0 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -55,9 +55,10 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { #define NCCL_NUM_FUNCTIONS 5 typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring +#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET 2 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_LL 0 
diff --git a/src/include/cpuset.h b/src/include/cpuset.h index 98b93de87d..40c159410e 100644 --- a/src/include/cpuset.h +++ b/src/include/cpuset.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -19,7 +19,7 @@ static int hexToInt(char c) { #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) -ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) { +ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { uint32_t cpumasks[CPU_SET_N_U32]; int m = CPU_SET_N_U32-1; cpumasks[m] = 0; diff --git a/src/include/debug.h b/src/include/debug.h index 89b6e42e75..d88458c24a 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -29,11 +29,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // Let code temporarily downgrade WARN into INFO extern thread_local int ncclDebugNoWarn; -#define NOWARN(a, ret) do { \ - ncclDebugNoWarn = 1; \ - ret = a; \ - ncclDebugNoWarn = 0; \ -} while (0) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) 
ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) diff --git a/src/include/devcomm.h b/src/include/devcomm.h index a9aeddb8cb..4a90ff5342 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "rccl_bfloat16.h" +#include "align.h" #include // Convert volatile access to atomic @@ -24,14 +25,6 @@ #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - union ncclLLFifoLine { /* Flags have to be *after* data, because otherwise, an incomplete receive from the network may receive the flag but not the data. @@ -84,6 +77,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) +#define NCCL_DIRECT_GPU 0x01 +#define NCCL_DIRECT_NIC 0x10 + struct ncclConnInfo { // Regular comm mechanism char *buff; // Local for recv, remote for send @@ -190,6 +186,8 @@ struct ncclChannel { struct ncclRing ring; struct ncclTree treeUp; struct ncclTree treeDn; + struct ncclTree collTreeUp; + struct ncclTree collTreeDn; int id; int nthreads; diff --git a/src/include/graph.h b/src/include/graph.h index 3c8ba199ba..b27ea3523c 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -14,17 +14,6 @@ #include #include -enum ncclPathDist { - PATH_PIX = 0, - PATH_PXB = 1, - PATH_PHB = 2, - PATH_NODE = 3, - PATH_SYS = 4, - PATH_ARRAY_SIZE = 5 -}; - -extern const char* pathDists[PATH_ARRAY_SIZE]; - ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); struct ncclTopoSystem; @@ -36,32 +25,47 @@ ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); -ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system); // Query topology -ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink); -ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink); -ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance); ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net); -ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance); -ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p); +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); + +// Set CPU affinity +ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank); + +#define NCCL_TOPO_CPU_ARCH_X86 1 +#define NCCL_TOPO_CPU_ARCH_POWER 2 +#define NCCL_TOPO_CPU_ARCH_ARM 3 +#define NCCL_TOPO_CPU_VENDOR_INTEL 1 +#define NCCL_TOPO_CPU_VENDOR_AMD 2 +#define NCCL_TOPO_CPU_TYPE_BDW 1 +#define NCCL_TOPO_CPU_TYPE_SKL 2 +ncclResult_t ncclTopoCpuType(struct 
ncclTopoSystem* system, int* arch, int* vendor, int* model); #define NCCL_TOPO_MAX_NODES 256 +// Init search. Needs to be done before calling ncclTopoCompute +ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); + #define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction #define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions #define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions #define NCCL_TOPO_PATTERN_RING 4 // Ring struct ncclTopoGraph { // Input / output + int id; // ring : 0, tree : 1, collnet : 2 int pattern; int crossNic; + int collNet; + int minChannels; + int maxChannels; // Output int nChannels; - int speedIntra; - int speedInter; - int type; - int nvlink; + float speedIntra; + float speedInter; + int typeIntra; + int typeInter; int sameChannels; int nHops; int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; @@ -70,6 +74,7 @@ struct ncclTopoGraph { ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); +ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); struct ncclTopoRanks { int ringRecv[MAXCHANNELS]; @@ -83,12 +88,16 @@ struct ncclTopoRanks { }; ncclResult_t ncclTopoPreset(struct ncclComm* comm, - struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, struct ncclTopoRanks* topoRanks); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings); -ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph); +ncclResult_t 
ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank); + +ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); +#include "info.h" +ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time); #endif diff --git a/src/include/info.h b/src/include/info.h index 6eec677160..9e94fcdc1e 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -18,7 +18,9 @@ typedef enum { ncclPatternPipelineTo, ncclPatternTreeUp, ncclPatternTreeDown, - ncclPatternTreeUpDown + ncclPatternTreeUpDown, + ncclPatternCollTreeUp, + ncclPatternCollTreeDown } ncclPattern_t; // Used to pass NCCL call information between functions diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index d6ae9f8557..95dce5b812 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,6 +8,7 @@ #define NCCL_NET_H_ #include "nccl.h" +#include #define NCCL_NET_HANDLE_MAXSIZE 64 @@ -20,43 +21,17 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Return the device path in /sys. NCCL will call free on this path. 
- ncclResult_t (*pciPath)(int dev, char** path); - // Return whether this device supports host pointers and/or CUDA pointers - // as data from the current GPU. Supported types should be composed with - // NCCL_PTR_HOST and NCCL_PTR_CUDA. - ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connectHandle - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request); - // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*flush)(void* recvComm, void* data, int size); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v1_t; + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. 
+ uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int speed; // Port speed in Mbps. + int port; // Port number. + int maxComms; // Maximum number of comms we can create +}ncclNetProperties_v3_t; + +typedef ncclNetProperties_v3_t ncclNetProperties_t; typedef struct { // Name of the network (mainly for logs) @@ -65,12 +40,8 @@ typedef struct { ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); - // Return the device path in /sys. NCCL will call free on this path. - ncclResult_t (*pciPath)(int dev, char** path); - // Return whether this device supports host pointers and/or CUDA pointers - // as data from the current GPU. Supported types should be composed with - // NCCL_PTR_HOST and NCCL_PTR_CUDA. - ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -99,10 +70,52 @@ typedef struct { ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v2_t; +} ncclNet_v3_t; -typedef ncclNet_v2_t ncclNet_t; +typedef ncclNet_v3_t ncclNet_t; -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2 +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v3_t; + +typedef ncclCollNet_v3_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3 #endif // end include guard diff --git a/src/include/net.h b/src/include/net.h index 6a490da658..a6ac5ba327 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -17,7 +17,7 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; // Translation to external API static const char* ncclNetName() { return ncclNet->name; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; } +static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; } static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } @@ -31,33 +31,43 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } +// Test whether the current GPU support GPU Direct RDMA. 
#define GPU_BUF_SIZE (2*1024*1024) -static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { - int support; - NCCLCHECK(ncclNet->ptrSupport(dev, &support)); - *supportedTypes = support & ~NCCL_PTR_CUDA; - // The network supports GPU Direct RDMA ; verify the GPU supports it as well. - if (support & NCCL_PTR_CUDA) { +static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { + int netDevs; + NCCLCHECK(ncclNetDevices(&netDevs)); + *gdrSupport = 0; + for (int dev=0; devgetProperties(dev, &props)); + if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + if (!hasFineGrainVramPcie()) continue; +#endif + + // Allocate memory on the GPU and try to register it on the NIC. void *lComm = NULL, *sComm = NULL, *rComm = NULL; ncclNetHandle_t handle; void* gpuPtr = NULL; void* mHandle = NULL; - ncclResult_t res; - NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup); - NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup); - NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup); - CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup); - NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res); - if (res != ncclSuccess) goto cleanup; - NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup); - NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup); - NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup); - *supportedTypes |= NCCL_PTR_CUDA; -cleanup: - if (gpuPtr) hipFree(gpuPtr); - if (rComm) ncclNetCloseRecv(rComm); - if (sComm) ncclNetCloseSend(sComm); - if (lComm) ncclNetCloseListen(lComm); + NCCLCHECK(ncclNetListen(dev, &handle, &lComm)); + NCCLCHECK(ncclNetConnect(dev, &handle, &sComm)); + NCCLCHECK(ncclNetAccept(lComm, &rComm)); + CUDACHECK(hipExtMallocWithFlags(&gpuPtr, GPU_BUF_SIZE, hipDeviceMallocFinegrained)); + ncclDebugNoWarn = NCCL_NET; + if (ncclNetRegMr(sComm, gpuPtr, 
GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { + NCCLCHECK(ncclNetDeregMr(sComm, mHandle)); + NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); + NCCLCHECK(ncclNetDeregMr(rComm, mHandle)); + *gdrSupport = 1; + } + ncclDebugNoWarn = 0; + CUDACHECK(hipFree(gpuPtr)); + NCCLCHECK(ncclNetCloseRecv(rComm)); + NCCLCHECK(ncclNetCloseSend(sComm)); + NCCLCHECK(ncclNetCloseListen(lComm)); + break; } return ncclSuccess; } diff --git a/src/include/socket.h b/src/include/socket.h index 96bf5db39d..93760626cb 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -283,6 +283,7 @@ static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* } static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { + static int shownIfName = 0; int nIfs = 0; // Allow user to force the INET socket family selection int sock_family = envSocketFamily(); @@ -290,6 +291,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam char* env = getenv("NCCL_SOCKET_IFNAME"); if (env && strlen(env) > 1) { // Specified by user : find or fail + if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); } else { // Try to automatically pick the right one diff --git a/src/include/transport.h b/src/include/transport.h index 8f9bf0e98f..e25132f39c 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -53,6 +53,8 @@ struct ncclProxyArgs { int nsteps; uint64_t opCount; int protocol; + ncclDataType_t dtype; + ncclRedOp_t redOp; int state; // add component before this line -- it is left out during initialization // Internal state @@ -80,7 +82,7 @@ struct ncclProxyState { struct ncclTransportComm { ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int 
buffSize, int channelId); - ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*); + ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); }; diff --git a/src/include/utils.h b/src/include/utils.h index 266abca7be..03dcd96b58 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -14,7 +14,7 @@ int ncclCudaCompCap(); // PCI Bus ID <-> int64 conversion functions ncclResult_t int64ToBusId(int64_t id, char* busId); -ncclResult_t busIdToInt64(char* busId, int64_t* id); +ncclResult_t busIdToInt64(const char* busId, int64_t* id); ncclResult_t getBusId(int cudaDev, int64_t *busId); @@ -37,4 +37,6 @@ static long log2i(long n) { return l; } +int busIdToCudaDev(int64_t busId); + #endif diff --git a/src/init.cc b/src/init.cc index f22b867c3f..10bc1d62e7 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -12,11 +12,10 @@ #include "transport.h" #include "group.h" #include "net.h" +#include "coll_net.h" #include "enqueue.h" #include "graph.h" #include "argcheck.h" -#include "cpuset.h" -#include #include #include #include @@ -27,6 +26,7 @@ #include #include #include +#include "graph/topo.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -46,6 +46,7 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); ncclNet_t* ncclNet = NULL; +ncclCollNet_t* ncclCollNet = NULL; // Returns ncclInternalError if anything fails, causing that network to be ignored. ncclResult_t initNet(ncclNet_t* net) { @@ -56,7 +57,15 @@ ncclResult_t initNet(ncclNet_t* net) { return ncclSuccess; } -ncclResult_t initNetPlugin(ncclNet_t** net) { +ncclResult_t initCollNet(ncclCollNet_t* collnet) { + int ndev; + if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; + if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError; + if (ndev <= 0) return ncclSystemError; + return ncclSuccess; +} + +ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) { void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL); if (netPluginLib == NULL) { // dlopen does not guarantee to set errno, but dlerror only gives us a @@ -72,13 +81,17 @@ ncclResult_t initNetPlugin(ncclNet_t** net) { ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); if (extNet == NULL) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); - goto cleanup; - } - if (initNet(extNet) == ncclSuccess) { + } else if (initNet(extNet) == ncclSuccess) { *net = extNet; + // Check for CollNet + ncclCollNet_t* extCollNet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL)); + if (extCollNet == NULL) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol."); + } else if 
(initCollNet(extCollNet) == ncclSuccess) { + *collnet = extCollNet; + } return ncclSuccess; } -cleanup: if (netPluginLib != NULL) dlclose(netPluginLib); return ncclSuccess; } @@ -87,7 +100,7 @@ ncclResult_t initNet() { // Always initialize bootstrap network NCCLCHECK(bootstrapNetInit()); - NCCLCHECK(initNetPlugin(&ncclNet)); + NCCLCHECK(initNetPlugin(&ncclNet, &ncclCollNet)); if (ncclNet != NULL) return ncclSuccess; if (initNet(&ncclNetIb) == ncclSuccess) { ncclNet = &ncclNetIb; @@ -98,6 +111,8 @@ ncclResult_t initNet() { return ncclSuccess; } +NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); + pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; static ncclResult_t ncclInit() { @@ -106,6 +121,7 @@ static ncclResult_t ncclInit() { if (!initialized) { initEnv(); initNet(); + INFO(NCCL_INIT, "Using network %s", ncclNetName()); initialized = true; } pthread_mutex_unlock(&initLock); @@ -321,6 +337,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { else comm->hostDevComm.collTraceThread = 0; #endif + comm->collNetSupport = 0; *comret = comm; return ncclSuccess; @@ -334,7 +351,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { // Copy userRanks and peers for (int r=0; rnChannels; r++) { NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); - NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks+1)); } // Duplicate the dev comm on the device @@ -374,19 +391,11 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->shmDev = statbuf.st_dev; info->busId = comm->busId; - int netDevs; - NCCLCHECK(ncclNetDevices(&netDevs)); - for (int n=0; ngdrSupport |= (1 << n); - } + NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); return ncclSuccess; } -static ncclResult_t setCpuAffinity(int cudaDev); - 
template static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { for (int t=0; tcanConnect(&ret, topo, graph, myInfo, peerInfo)); if (ret) { - cpu_set_t affinitySave; - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - int cudaDev; - CUDACHECK(hipGetDevice(&cudaDev)); - setCpuAffinity(cudaDev); connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId)); - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); return ncclSuccess; } } @@ -509,7 +512,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnector* conn; for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].recv; if (conn->connected) { ++nSkippedRecv; continue; } memset(&connect, 0, sizeof(connect)); @@ -518,7 +521,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, } for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].send; if (conn->connected) { ++nSkippedSend; continue; } memset(&connect, 0, sizeof(connect)); @@ -527,29 +530,148 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, } for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].send; if (conn->connected) {++nSkippedSend; continue; } memset(&connect, 0, sizeof(connect)); NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - NCCLCHECK(conn->transportComm->connect(&connect, conn)); + NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); conn->connected = 1; } for (int i=0; i= comm->nRanks) continue; conn = &channel->peers[peer].recv; if (conn->connected) {++nSkippedRecv; continue; } memset(&connect, 0, sizeof(connect)); 
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - NCCLCHECK(conn->transportComm->connect(&connect, conn)); + NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); conn->connected = 1; } TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); return ncclSuccess; } +extern struct ncclTransport collNetTransport; + +// All ranks must participate in collNetSetup call +// type: 0 for send, 1 for recv +// return: 0 - unsupported, 1 - supported +static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int collNetChannels, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) { + int rankInCollNet = -1; + int supported = 0; + int isMaster = (rank == masterRank) ? 1 : 0; + struct { + int collNetRank; + ncclConnect connect; + } sendrecvExchange; + + // check if we can connect to collnet, whose root is the nranks-th rank + struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks; + peerInfo->rank = nranks; + int ret = 1; + if (isMaster) { + NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo)); + } + + // send master receives connect info from peer recv master + if (isMaster && type == 0) { + NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange))); + rankInCollNet = sendrecvExchange.collNetRank; + INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer); + } + + // select + struct ncclPeer* root = channel->peers+nranks; + struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send; + struct ncclTransportComm* transportComm = (type == 1) ? 
&(collNetTransport.recv) : &(collNetTransport.send); + conn->transportComm = transportComm; + // setup + struct ncclConnect myConnect; + if (isMaster && ret > 0) { + NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->buffSize, channel->id)); + } + // prepare connect handles + ncclResult_t res; + struct { + int isMaster; + ncclConnect connect; + } *allConnects = NULL; + ncclConnect *masterConnects = NULL; + NCCLCHECK(ncclCalloc(&masterConnects, nMasters)); + if (type == 1) { // recv side: AllGather + // all ranks must participate + NCCLCHECK(ncclCalloc(&allConnects, nranks)); + allConnects[rank].isMaster = isMaster; + memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect)); + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup); + // consolidate + int c = 0; + for (int r = 0; r < nranks; r++) { + if (allConnects[r].isMaster) { + memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect)); + if (r == rank) rankInCollNet = c; + c++; + } + } + } else { // send side : copy in connect info received from peer recv master + if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect)); + } + // connect + if (isMaster && ret > 0) { + NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup); + } + // recv side sends connect info to send side + if (isMaster && type == 1) { + sendrecvExchange.collNetRank = rankInCollNet; + memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect)); + NCCLCHECK(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange))); + INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); + } + if (ret > 0) { + supported = 1; + } +cleanup: + if (allConnects != NULL) free(allConnects); 
+ if (masterConnects != NULL) free(masterConnects); + return supported; +} + +static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) { + int nranks = comm->nRanks; + // AllGather collNet setup results + int* allGatherFailures; + NCCLCHECK(ncclCalloc(&allGatherFailures, nranks)); + allGatherFailures[rank] = collNetSetupFail; + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int))); + for (int i=0; inChannels; r++) { + struct ncclChannel* channel = comm->channels+r; + struct ncclPeer* peer = channel->peers+nranks; + if (peer->send.transportResources && peer->send.transportComm) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); + if (peer->recv.transportResources && peer->recv.transportComm) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); + peer->send.transportResources = NULL; // avoid double free + peer->recv.transportResources = NULL; // avoid double free + } + // Set support to 0 + comm->collNetSupport = 0; + } else { + comm->collNetSupport = 1; + } + return ncclSuccess; +} + NCCL_PARAM(CrossNic, "CROSS_NIC", 2); +NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { // We use 3 AllGathers @@ -575,7 +697,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(fillInfo(comm, myInfo, commHash)); NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); - NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); + NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root for (int i = 0; i < nranks; i++) { memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { @@ -594,60 +716,82 @@ static ncclResult_t initTransportsRank(struct 
ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); // Recompute paths after trimming NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); - // Compute max speed to accelerate search - NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo)); + // Init search + NCCLCHECK(ncclTopoSearchInit(comm->topo)); // Print final topology NCCLCHECK(ncclTopoPrint(comm->topo)); // Get rings and trees - struct ncclTopoGraph treeGraph; - treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; - treeGraph.crossNic = ncclParamCrossNic(); - // We communicate only half the data between node with trees on 2 nodes. - NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); - NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); struct ncclTopoGraph ringGraph; + ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; ringGraph.crossNic = ncclParamCrossNic(); + ringGraph.collNet = 0; + ringGraph.minChannels = 1; + ringGraph.maxChannels = MAXCHANNELS/2; NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph)); + struct ncclTopoGraph treeGraph; + treeGraph.id = 1; + treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; + treeGraph.crossNic = ncclParamCrossNic(); + treeGraph.collNet = 0; + treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 
1 : ringGraph.nChannels; + treeGraph.maxChannels = ringGraph.nChannels; + NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); + + struct ncclTopoGraph collNetGraph; + collNetGraph.id = 2; + collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; + collNetGraph.collNet = 1; + collNetGraph.crossNic = ncclParamCrossNic(); + collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; + NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); + + if (comm->rank == ncclParamGraphDumpFileRank()) { + struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; + NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); + } + // AllGather3 - begin + struct ncclGraphInfo { + int sameChannels; + float speedIntra; + float speedInter; + int typeIntra; + }; struct { int cudaCompCap; int fullCudaCompCap; - int nvlink; int nChannels; - struct { - int sameChannels; - int speedIntra; - int speedInter; - int nvlink; - } tree; - struct { - int sameChannels; - int speedIntra; - int speedInter; - int nvlink; - } ring; + struct ncclGraphInfo tree; + struct ncclGraphInfo ring; + struct ncclGraphInfo collNet; struct ncclTopoRanks topoRanks; } *allGather3Data; NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); - allGather3Data[rank].nvlink = treeGraph.nvlink; - allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = + std::min(treeGraph.nChannels, ringGraph.nChannels); allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; allGather3Data[rank].tree.speedInter = treeGraph.speedInter; - allGather3Data[rank].tree.nvlink = treeGraph.nvlink; + allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; 
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; allGather3Data[rank].ring.speedInter = ringGraph.speedInter; - allGather3Data[rank].ring.nvlink = ringGraph.nvlink; + allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; + allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; + allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra; + allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter; + allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; - NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); + NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks)); NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); @@ -675,9 +819,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap); } - comm->nvlink = 1; - for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink; - int nChannelsOrig = comm->nChannels; struct ncclTopoRanks** allTopoRanks; NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); @@ -688,11 +829,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); - treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink); + treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); 
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); - ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink); + ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); + collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); + collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); + collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); + collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); } if (comm->nChannels < nChannelsOrig) { @@ -705,6 +850,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings)); + if (comm->nNodes > 1 && + ncclParamCollNetEnable() == 1 && + collNetSupport()) { + NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank)); + } free(allTopoRanks); free(nodesFirstRank); @@ -714,7 +864,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); - NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph)); + NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); char line[1024]; line[0]='\0'; @@ -728,21 +878,58 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); + // Set Affinity to a CPU local the our GPU, so that all memory we allocate + // on the host is local. 
+ cpu_set_t affinitySave; + sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank)); + ncclResult_t ret; + // Connect with prev/next for each ring struct ncclConnect *connect; - NCCLCHECK(ncclCalloc(&connect, 2)); + NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore); for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; - NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks)); + NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; - NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); - NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up)); - NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down)); + NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore); + NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore); + NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore); + } + + // Check if we can setup CollNet + if (comm->nNodes > 1 && + ncclParamCollNetEnable() == 1 && + collNetSupport()) { + int logicChannels = comm->nChannels/2; + int collNetSetupFail = 0; + const int recvIndex = 0; // recv GPU index is always 0 + const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 
0 : 1; // send GPU index depends on topo pattern + for (int c=0; cchannels+logicChannels+c; + struct ncclChannel* channelSend = comm->channels+c; + NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down)); + NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up)); + const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex]; + const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex]; + if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1) + collNetSetupFail = 1; + if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1) + collNetSetupFail = 1; + } + // Verify CollNet setup across ranks + NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail)); } TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); free(connect); free(rings); + // We should have allocated all buffers, collective fifos, ... we can + // restore the affinity. 
+affinity_restore: + sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + if (ret != ncclSuccess) return ret; + // Compute intra ranks (using AllGather1 data) int intraRank0 = -1, intraRank = -1, intraRanks = 0; for (int i = 0; i < nranks; i++) { @@ -771,98 +958,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm return ncclSuccess; } -static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { - CPU_ZERO_S(sizeof(cpu_set_t), mask); - char* cudaPath; - NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath)); - char path[PATH_MAX]; - strncpy(path, cudaPath, PATH_MAX-1); - snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus"); - path[PATH_MAX-1] = '\0'; - int fd; - SYSCHECKVAL(open(path, O_RDONLY), "open", fd); - char affinityStr[sizeof(cpu_set_t)*2 + 1]; - int r = read(fd, affinityStr, sizeof(cpu_set_t)*2); - if (r > 0) { - affinityStr[r] = '\0'; - NCCLCHECK(ncclStrToCpuset(affinityStr, mask)); - } - close(fd); - free(cudaPath); - return ncclSuccess; -} - -NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); - -static ncclResult_t setCpuAffinity(int cudaDev) { - // Query the CPU affinity set we were provided - cpu_set_t mask; - SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); - -#ifdef ENABLE_TRACE - { - char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); - TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr); - } -#endif - - // Find the CPUs that are local to the supplied GPU - cpu_set_t gpuMask; - NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask)); - -#ifdef ENABLE_TRACE - { - char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr)); - TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr); - } -#endif - - cpu_set_t finalMask; - if (ncclParamIgnoreCpuAffinity()) - // Ignore the CPU affinity set and use the GPU one instead - finalMask = gpuMask; - else - // Use a 
subset of the GPU affinity set - CPU_AND(&finalMask, &mask, &gpuMask); - - // If there is a non empty set, use it to set affinity - if (CPU_COUNT(&finalMask)) { - char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); - INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr); - SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity"); - } - return ncclSuccess; -} - ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { - cpu_set_t affinitySave; - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - - NCCLCHECK(wrapNvmlSymbols()); - NCCLCHECK(wrapNvmlInit()); - - // Make sure all host memory allocation are close to the GPU - CUDACHECK(hipSetDevice(cudaDev)); - NCCLCHECK(setCpuAffinity(cudaDev)); ncclResult_t res; + CUDACHECK(hipSetDevice(cudaDev)); NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); return ncclSuccess; cleanup: if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); *newcomm = NULL; - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); return res; } diff --git a/src/misc/nvmlwrap_stub.cc b/src/misc/nvmlwrap_stub.cc index 47ea80ca6a..eed704de49 100644 --- a/src/misc/nvmlwrap_stub.cc +++ b/src/misc/nvmlwrap_stub.cc @@ -20,7 +20,7 @@ ncclResult_t wrapNvmlShutdown(void) { } ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { - return ncclSuccess; + return ncclSystemError; } ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { @@ 
-29,7 +29,7 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { } ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { - return ncclSuccess; + return ncclSystemError; } ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { @@ -38,17 +38,16 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min } ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { - return ncclSuccess; + return ncclSystemError; } ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { - return ncclSuccess; + return ncclSystemError; } ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult) { - *capResult = 0; - return ncclSuccess; + return ncclSystemError; } ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 262974305f..ecedd764b6 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -26,7 +26,7 @@ ncclResult_t int64ToBusId(int64_t id, char* busId) { return ncclSuccess; } -ncclResult_t busIdToInt64(char* busId, int64_t* id) { +ncclResult_t busIdToInt64(const char* busId, int64_t* id) { const int size = strlen(busId); char* hexStr; NCCLCHECK(ncclCalloc(&hexStr, size)); diff --git a/src/transport.cc b/src/transport.cc index d501021369..d0f29133e6 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -101,6 +101,7 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) { struct ncclPeer* peerComm = args->channel->peers+peer; struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send; + if (connector->transportComm == NULL) return ncclInternalError; if (connector->transportComm->proxy == NULL) return ncclSuccess; struct ncclProxyArgs* op; @@ -131,6 +132,18 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(tree->down[i], args)); NCCLCHECK(SaveProxy(tree->up, args)); } + if (pattern == ncclPatternCollTreeUp) { + // CollTree up + struct ncclTree* tree = &args->channel->collTreeUp; + NCCLCHECK(SaveProxy(tree->down[0], args)); + NCCLCHECK(SaveProxy(tree->up, args)); + } + if (pattern == ncclPatternCollTreeDown) { + // CollTree down + struct ncclTree* tree = &args->channel->collTreeDn; + NCCLCHECK(SaveProxy(tree->down[0], args)); + NCCLCHECK(SaveProxy(tree->up, args)); + } return ncclSuccess; } diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc new file mode 100644 index 0000000000..dc49c6c849 --- /dev/null +++ b/src/transport/coll_net.cc @@ -0,0 +1,431 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "coll_net.h" +#include "graph.h" +#include + +struct collNetRecvConnectInfo { + collNetHandle_t collNetHandle; +}; + +struct collNetSendConnectInfo { + void* collNetComm; + void* mhandle; + void* llMhandle; + struct reqSlot* reqFifo; +}; + +struct ncclLLDataLine { + uint32_t data1; + uint32_t data2; +}; +static_assert(sizeof(struct ncclLLDataLine) == sizeof(union ncclLLFifoLine)>>1, "ncclLLDataLine is not half size of ncclLLFifoLine"); + +struct reqSlot { + volatile void* recvBuff; + volatile int size; +}; + +struct collNetSendResources { + void* collNetSendComm; + struct ncclSendMem* hostSendMem; + struct ncclRecvMem* hostRecvMem; + struct ncclSendMem* devHostSendMem; + struct ncclRecvMem* devHostRecvMem; + struct ncclLLDataLine* llData; + int netDev; + int useGdr; + int buffSize; + void* sendMhandle; + void* llSendMhandle; + void* recvMhandle; + void* llRecvMhandle; + struct ncclRecvMem* devRecvMem; + uint64_t step; + uint64_t llLastCleaning; + struct reqSlot* reqFifo; + int collNetRank; +}; + +struct collNetRecvResources { + void* netListenComm; + void* collNetRecvComm; + struct ncclSendMem* hostSendMem; + struct ncclRecvMem* hostRecvMem; + struct ncclSendMem* devHostSendMem; + struct ncclRecvMem* devHostRecvMem; + struct ncclLLDataLine* llData; + int netDev; + int useGdr; + int buffSize; + void* mhandle; + void* llMhandle; + struct ncclRecvMem* devRecvMem; + uint64_t step; + uint64_t llLastCleaning; + struct reqSlot* reqFifo; + int collNetRank; +}; + +/* Determine if we can communicate with the peer */ +ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + *ret = 1; + return ncclSuccess; +} + +/* Setup send connector, and return connect information for others in the coll communicator to connect to me */ 
+ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + struct collNetSendResources* sendResources; + NCCLCHECK(ncclCalloc(&sendResources, 1)); + send->transportResources = sendResources; + + NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &sendResources->netDev)); + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr)); + + int sendSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostSendMem, (void**)&sendResources->devHostSendMem, sendSize)); + + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + if (sendResources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&sendResources->devRecvMem), recvSize, true)); + } + NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostRecvMem, (void**)&sendResources->devHostRecvMem, recvSize)); + NCCLCHECK(ncclIbMalloc((void**)&(sendResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine))); + sendResources->buffSize = buffSize; + + INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), sendResources->netDev, + sendResources->useGdr ? 
"/GDRDMA" : ""); + + return ncclSuccess; +} + +/* Setup recv connector */ +ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { + struct collNetRecvResources* recvResources; + NCCLCHECK(ncclCalloc(&recvResources, 1)); + recv->transportResources = recvResources; + + NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &recvResources->netDev)); + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr)); + + int sendSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostSendMem, (void**)&recvResources->devHostSendMem, sendSize)); + + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + if (recvResources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&recvResources->devRecvMem), recvSize, true)); + } + NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostRecvMem, (void**)&recvResources->devHostRecvMem, recvSize)); + NCCLCHECK(ncclIbMalloc((void**)&(recvResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine))); + recvResources->buffSize = buffSize; + + INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), recvResources->netDev, + recvResources->useGdr ? 
"/GDRDMA" : ""); + + struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; + NCCLCHECK(collNetListen(recvResources->netDev, &info->collNetHandle, &recvResources->netListenComm)); + + return ncclSuccess; +} + +ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { + // Setup device pointers + struct collNetSendResources* sendResources = (struct collNetSendResources*)send->transportResources; + sendResources->collNetRank = rank; + + // Get info from recv side + struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank); + sendResources->reqFifo = sInfo->reqFifo; + sendResources->collNetSendComm = sInfo->collNetComm; + sendResources->recvMhandle = sInfo->mhandle; + sendResources->llRecvMhandle = sInfo->llMhandle; + + // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host + struct ncclRecvMem* sRecvMem = sendResources->useGdr ? sendResources->devRecvMem : sendResources->devHostRecvMem; + // Register buffers + NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sRecvMem->buff, sendResources->buffSize, + sendResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &sendResources->sendMhandle)); + NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sendResources->llData, + NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &sendResources->llSendMhandle)); + + send->conn.buff = sRecvMem->buff; + send->conn.llBuff = sendResources->devHostRecvMem->llBuff; + send->conn.direct |= sendResources->useGdr ? 
NCCL_DIRECT_NIC : 0; + + // Head/Tail/Opcount/Fifos are always on host + send->conn.tail = &sendResources->devHostRecvMem->tail; + send->conn.opCountRem = &sendResources->devHostRecvMem->opCount; + send->conn.fifo = sendResources->devHostRecvMem->sizesFifo; + send->conn.head = &sendResources->devHostSendMem->head; + send->conn.opCountLoc = &sendResources->devHostSendMem->opCount; + for (int i=0; iconn.fifo[i] = -1; + + return ncclSuccess; +} + +ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { + // Setup device pointers + struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recv->transportResources; + struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank); + recvResources->collNetRank = rank; + + // Intermediate buffering on GPU for GPU Direct RDMA + struct ncclRecvMem* rRecvMem = recvResources->useGdr ? recvResources->devRecvMem : recvResources->devHostRecvMem; + recv->conn.buff = rRecvMem->buff; + recv->conn.llBuff = recvResources->devHostRecvMem->llBuff; // recv LL buff always on host + recv->conn.direct |= recvResources->useGdr ? 
NCCL_DIRECT_NIC : 0; + + // Head/Tail/Opcount are always on host + recv->conn.tail = &recvResources->devHostRecvMem->tail; + recv->conn.opCountLoc = &recvResources->devHostRecvMem->opCount; + recv->conn.head = &recvResources->devHostSendMem->head; + recv->conn.opCountRem = &recvResources->devHostSendMem->opCount; + + // Connect to coll comm + collNetHandle_t** handlePtrs = NULL; + NCCLCHECK(ncclCalloc(&handlePtrs, nranks)); + for (int i = 0; i < nranks; i++) { + struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i); + handlePtrs[i] = &(info->collNetHandle); + } + ncclResult_t res; + NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, recvResources->netListenComm, &recvResources->collNetRecvComm), res, cleanup); + + // Register buffers + NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, rRecvMem->buff, recvResources->buffSize, + recvResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &recvResources->mhandle)); + NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, recvResources->llData, + NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &recvResources->llMhandle)); + + // Create shared info between send and recv proxies + NCCLCHECK(ncclCalloc(&(recvResources->reqFifo), NCCL_STEPS)); + + // Pass info to send side + sInfo->reqFifo = recvResources->reqFifo; + sInfo->collNetComm = recvResources->collNetRecvComm; + sInfo->mhandle = recvResources->mhandle; + sInfo->llMhandle = recvResources->llMhandle; + +cleanup: + if (handlePtrs != NULL) free(handlePtrs); + // Close listen comm + NCCLCHECK(collNetCloseListen(recvResources->netListenComm)); + + return res; +} + +ncclResult_t collNetSendFree(void* sendTransportResources) { + struct collNetSendResources* sendResources = (struct collNetSendResources*)sendTransportResources; + NCCLCHECK(ncclCudaHostFree(sendResources->hostSendMem)); + NCCLCHECK(ncclCudaHostFree(sendResources->hostRecvMem)); + if (sendResources->collNetSendComm) { + 
NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->sendMhandle)); + NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->llSendMhandle)); + } + if (sendResources->useGdr) + CUDACHECK(hipFree(sendResources->devRecvMem)); + free(sendResources->llData); + free(sendResources); + return ncclSuccess; +} + +ncclResult_t collNetRecvFree(void* recvTransportResources) { + struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recvTransportResources; + NCCLCHECK(ncclCudaHostFree(recvResources->hostSendMem)); + if (recvResources->collNetRecvComm) { + NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->mhandle)); + NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->llMhandle)); + } + NCCLCHECK(ncclCudaHostFree(recvResources->hostRecvMem)); + if (recvResources->useGdr) + CUDACHECK(hipFree(recvResources->devRecvMem)); + free(recvResources->llData); + free(recvResources->reqFifo); + + // Make sure SendFree is called before RecvFree + if (recvResources->collNetRecvComm) { + NCCLCHECK(collNetCloseColl(recvResources->collNetRecvComm)); + } + free(recvResources); + return ncclSuccess; +} + +ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { + if (args->protocol == NCCL_PROTO_LL128) { + WARN("CollNet does not support LL128"); + return ncclInternalError; + } + struct collNetSendResources* resources = (struct collNetSendResources*) (args->connector->transportResources); + if (args->state == ncclProxyOpReady) { + // Update opCount + STORE(&resources->hostRecvMem->opCount, args->opCount); + + // Round to next multiple of sliceSteps + resources->step = ROUNDUP(resources->step, args->chunkSteps); + args->head = resources->step; + args->tail = resources->step; + args->end = args->head + args->nsteps; + args->state = ncclProxyOpProgress; + } + if (args->state == ncclProxyOpProgress) { + args->idle = 1; + struct reqSlot* reqFifo = resources->reqFifo; + if (args->head < args->end) { + 
int buffSlot = args->tail%NCCL_STEPS; + if (args->tail < args->end && args->tail < args->head + NCCL_STEPS + && reqFifo[buffSlot].recvBuff != NULL) { + volatile int* sizesFifo = resources->hostRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->hostRecvMem->tail; + if (args->protocol == NCCL_PROTO_LL) { + int size = LOAD(sizesFifo+buffSlot); + if (size != -1) { + uint32_t flag = NCCL_LL_FLAG(args->tail + 1); + int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); + union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES; + int ready = 1; + for (int i=0; illData+buffSlot*NCCL_LL_SLICE_LINES; + for (int i=0; idtype); + NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->llSendMhandle, resources->llRecvMhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]); + STORE(sizesFifo+buffSlot, -1); + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + } else if (args->tail < LOAD(recvTail)) { + int stepSize = args->channel->buffSize/NCCL_STEPS; + struct ncclRecvMem* localMem = resources->useGdr ? 
resources->devRecvMem : resources->hostRecvMem; + // Send through network + if (LOAD(sizesFifo+buffSlot) != -1) { + int count = LOAD(sizesFifo+buffSlot)/ncclTypeSize(args->dtype); + NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localMem->buff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->sendMhandle, resources->recvMhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count); + STORE(sizesFifo+buffSlot, -1); + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + } + if (args->head < args->tail) { + int done, size; + int buffSlot = args->head%NCCL_STEPS; + NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size)); + if (done) { + TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size); + reqFifo[buffSlot].size = size; + // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) + // (reordered store after store is possible on POWER, though not on x86) + __sync_synchronize(); + reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy + args->head += args->sliceSteps; + STORE(&resources->hostSendMem->head, args->head); + args->idle = 0; + } + } + } + if (args->head == args->end) { + resources->step = args->end; + args->idle = 0; + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + +ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { + if (args->protocol == NCCL_PROTO_LL128) { + WARN("CollNet does not support LL128"); + return ncclInternalError; + } + struct collNetRecvResources* resources = (struct collNetRecvResources*) (args->connector->transportResources); + if (args->state == ncclProxyOpReady) { + // Update opCount 
+ STORE(&resources->hostSendMem->opCount, args->opCount); + + // Round to next multiple of sliceSteps + resources->step = ROUNDUP(resources->step, args->chunkSteps); + args->head = resources->step; + args->tail = resources->step; + args->end = args->head + args->nsteps; + args->state = ncclProxyOpProgress; + } + if (args->state == ncclProxyOpProgress) { + args->idle = 1; + int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine) : args->channel->buffSize ) / NCCL_STEPS; + struct reqSlot* reqFifo = resources->reqFifo; + if (args->head < args->end) { + struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; + char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)resources->llData : localMem->buff; + void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : resources->mhandle; + if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) { + int buffSlot = args->tail%NCCL_STEPS; + reqFifo[buffSlot].recvBuff = localBuff+buffSlot*stepSize; + TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, localBuff+buffSlot*stepSize); + args->tail += args->sliceSteps; + args->idle = 0; + } + if (args->tail > args->head) { + int buffSlot = args->head%NCCL_STEPS; + if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete + TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size); + args->head += args->sliceSteps; + if (args->protocol == NCCL_PROTO_LL) { // ll + // re-attach flag + uint32_t flag = args->head; + union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(resources->hostRecvMem->llBuff)+buffSlot*NCCL_LL_SLICE_LINES; + struct ncclLLDataLine* recvData = resources->llData+buffSlot*NCCL_LL_SLICE_LINES; + int nFifoLines = DIVUP(reqFifo[buffSlot].size, sizeof(struct ncclLLDataLine)); + for (int i=0; iprotocol == 
NCCL_PROTO_SIMPLE) { + if (resources->useGdr) collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle); + STORE(&resources->hostRecvMem->tail, args->head); + } + args->idle = 0; + } + } + } + if (args->head == args->end) { + resources->step = args->end; + args->idle = 0; + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + +struct ncclTransport collNetTransport = { + "COL", + collNetCanConnect, + { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy }, + { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy } +}; diff --git a/src/transport/net.cc b/src/transport/net.cc index 320a0e0bc0..f9c586b894 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -55,52 +55,6 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop return ncclSuccess; } -NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); -NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB); - -static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) { - *useGdr = 0; - - int cudaDev; - CUDACHECK(hipGetDevice(&cudaDev)); - - if (!hasFineGrainVramPcie()) { - INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev); - return ncclSuccess; - } - - if (read) { // For reads (sends) only enable under certain conditions - int gdrReadParam = ncclParamNetGdrRead(); - if (gdrReadParam == 0) return ncclSuccess; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - return ncclSuccess; -#else - if (gdrReadParam < 0) { - int nvlink; - NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink)); - if (!nvlink) return ncclSuccess; - } -#endif - } - - // Check if we are close enough that it makes sense to enable GDR - int netGdrLevel = ncclParamNetGdrLevel(); - int distance; - NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance)); - if (distance >= 
netGdrLevel) { - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel); - return ncclSuccess; - } - - // Finally, check if the NIC supports it - int flags; - NCCLCHECK(ncclNetPtrSupport(netDev, &flags)); - if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess; - *useGdr = 1; - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read); - return ncclSuccess; -} - /* Determine if we will use this transport for this peer and return connect * information for this peer */ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { @@ -109,7 +63,7 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra send->transportResources = resources; NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev)); - NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); @@ -132,7 +86,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra recv->transportResources = resources; NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev)); - NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); @@ 
-151,7 +105,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra return ncclSuccess; } -ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { +ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers struct netSendResources* resources = (struct netSendResources*)send->transportResources; @@ -160,6 +114,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto send->conn.buff = recvMem->buff; send->conn.llBuff = resources->devHostRecvMem->llBuff; send->conn.ll128Buff = recvMem->ll128Buff; + send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; // Head/Tail/Opcount/Fifos are always on host send->conn.tail = &resources->devHostRecvMem->tail; @@ -184,7 +139,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto } /* Connect to this peer */ -ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { +ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { // Setup device pointers struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources; @@ -193,6 +148,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto recv->conn.buff = recvMem->buff; recv->conn.llBuff = recvMem->llBuff; recv->conn.ll128Buff = recvMem->ll128Buff; + recv->conn.direct |= resources->useGdr ? 
NCCL_DIRECT_NIC : 0; // Head/Tail/Opcount are always on host recv->conn.tail = &resources->devHostRecvMem->tail; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 972dd86712..edf57b9f36 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -29,13 +29,19 @@ #define MAXNAMESIZE 64 static char ncclIbIfName[MAX_IF_NAME_SIZE]; static union socketAddress ncclIbIfAddr; + static int ncclNIbDevs = -1; struct ncclIbDev { int device; + uint64_t guid; uint8_t port; uint8_t link; + int speed; ibv_context* context; char devName[MAXNAMESIZE]; + char* pciPath; + int realPort; + int maxQp; }; #define MAX_IB_PORT 15 @@ -54,20 +60,7 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbSl, "IB_SL", 0); NCCL_PARAM(IbTc, "IB_TC", 0); - -// Allocate memory to be potentially ibv_reg_mr'd. This needs to be -// allocated on separate pages as those pages will be marked DONTFORK -// and if they are shared, that could cause a crash in a child process -static ncclResult_t ncclIbMalloc(void** ptr, size_t size) { - size_t page_size = sysconf(_SC_PAGESIZE); - void* p; - int size_aligned = ROUNDUP(size, page_size); - int ret = posix_memalign(&p, page_size, size_aligned); - if (ret != 0) return ncclSystemError; - memset(p, 0, size); - *ptr = p; - return ncclSuccess; -} +NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { @@ -86,6 +79,39 @@ static void* ncclIbAsyncThreadMain(void* args) { NCCL_PARAM(IbDisable, "IB_DISABLE", 0); +static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) { + char devicePath[PATH_MAX]; + snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName); + char* p = realpath(devicePath, NULL); + if (p == NULL) { + WARN("Could not find real path of %s", *devicePath); + } else { + // Merge multi-port NICs into the same PCI device + p[strlen(p)-1] = '0'; + // And keep the real port 
aside (the ibv port is always 1 on recent cards) + *realPort = 0; + for (int d=0; dname, port, portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); ncclIbDevs[ncclNIbDevs].device = d; + ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; ncclIbDevs[ncclNIbDevs].port = port; ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); ncclIbDevs[ncclNIbDevs].context = context; strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); + NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort)); + ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; ncclNIbDevs++; nPorts++; pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); @@ -181,17 +211,6 @@ ncclResult_t ncclIbDevices(int* ndev) { return ncclSuccess; } -ncclResult_t ncclIbPciPath(int dev, char** path) { - char devicepath[PATH_MAX]; - snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName); - *path = realpath(devicepath, NULL); - if (*path == NULL) { - WARN("Could not find real path of %s", devicepath); - return ncclSystemError; - } - return ncclSuccess; -} - // Detect whether GDR can work on a given NIC with the current CUDA device // Returns : // ncclSuccess : GDR works @@ -209,19 +228,24 @@ ncclResult_t ncclIbGdrSupport(int ibDev) { return ncclSuccess; } -ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) { - *supportedTypes = NCCL_PTR_HOST; - - if (ncclIbGdrSupport(dev) != ncclSuccess) { - INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName); - return ncclSuccess; - } - *supportedTypes |= NCCL_PTR_CUDA; +static ncclResult_t GetSocketAddr(union socketAddress* addr) { + memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); return ncclSuccess; } -static ncclResult_t GetSocketAddr(union socketAddress* addr) { - 
memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); +ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { + props->name = ncclIbDevs[dev].devName; + props->pciPath = ncclIbDevs[dev].pciPath; + props->guid = ncclIbDevs[dev].guid; + props->ptrSupport = NCCL_PTR_HOST; + if (ncclIbGdrSupport(dev) != ncclSuccess) { + INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName); + } else { + props->ptrSupport |= NCCL_PTR_CUDA; + } + props->speed = ncclIbDevs[dev].speed; + props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort; + props->maxComms = ncclIbDevs[dev].maxQp; return ncclSuccess; } @@ -330,7 +354,8 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce qpInitAttr.send_cq = verbs->cq; qpInitAttr.recv_cq = verbs->cq; qpInitAttr.qp_type = IBV_QPT_RC; - qpInitAttr.cap.max_send_wr = MAX_REQUESTS; + // We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM) + qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS; qpInitAttr.cap.max_recv_wr = MAX_REQUESTS; qpInitAttr.cap.max_send_sge = 1; qpInitAttr.cap.max_recv_sge = 1; @@ -632,6 +657,10 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo wr.opcode = IBV_WR_SEND; wr.send_flags = IBV_SEND_SIGNALED; + int useAr = 0; + if (size > ncclParamIbArThreshold()) { + useAr = 1; + } #if USE_RDMA_WRITE __sync_synchronize(); // order the readyPtr load against rkey load below // Sanity checks to catch user collective call count/size mismatches @@ -641,7 +670,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead); return ncclInternalError; } - wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.opcode = useAr ? 
IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM; wr.wr.rdma.remote_addr = LOAD(&slot->addr); wr.wr.rdma.rkey = LOAD(&slot->rkey); wr.imm_data = size; // Send the message size via imm_data @@ -656,6 +685,19 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr)); + +#if USE_RDMA_WRITE + // When using adaptive routing, send the bulk of the data first as an + // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote + // completion. + if (useAr) { + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.sg_list = NULL; + wr.num_sge = 0; + wr.send_flags &= ~IBV_SEND_SIGNALED; + NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr)); + } +#endif *request = req; return ncclSuccess; } @@ -840,8 +882,7 @@ ncclNet_t ncclNetIb = { "IB", ncclIbInit, ncclIbDevices, - ncclIbPciPath, - ncclIbPtrSupport, + ncclIbGetProperties, ncclIbListen, ncclIbConnect, ncclIbAccept, diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index d6ca0b50b3..6bba7bee70 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -20,16 +20,31 @@ #include /* Init functions */ -static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; -static union socketAddress ncclNetIfAddrs[MAX_IFS]; static int ncclNetIfs = -1; +struct ncclSocketDev { + union socketAddress addr; + char devName[MAX_IF_NAME_SIZE]; + char* pciPath; +}; +static struct ncclSocketDev ncclSocketDevs[MAX_IFS]; + pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER; +static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) { + char devicePath[PATH_MAX]; + snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName); + // May return NULL if the file doesn't exist. 
+ *pciPath = realpath(devicePath, NULL); + return ncclSuccess; +} + ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { if (ncclNetIfs == -1) { pthread_mutex_lock(&ncclSocketLock); if (ncclNetIfs == -1) { - ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); + char names[MAX_IF_NAME_SIZE*MAX_IFS]; + union socketAddress addrs[MAX_IFS]; + ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); return ncclInternalError; @@ -38,8 +53,11 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { char addrline[1024]; line[0] = '\0'; for (int i=0; i 0) { + *speed = strtol(speedStr, NULL, 0); + } + close(fd); } + if (*speed <= 0) { + INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath); + *speed = 10000; + } + return ncclSuccess; +} + +ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) { + props->name = ncclSocketDevs[dev].devName; + props->pciPath = ncclSocketDevs[dev].pciPath; + props->guid = dev; + props->ptrSupport = NCCL_PTR_HOST; + NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed)); + props->port = 0; + props->maxComms = 65536; return ncclSuccess; } ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { if (dev >= ncclNetIfs) return ncclInternalError; - memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); + memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr)); return ncclSuccess; } @@ -197,7 +229,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) { // Auto-detection int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads char vendorPath[PATH_MAX]; - snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); + snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclSocketDevs[dev].devName); char* rPath = realpath(vendorPath, NULL); int fd = open(rPath, 
O_RDONLY); free(rPath); @@ -487,8 +519,7 @@ ncclNet_t ncclNetSocket = { "Socket", ncclSocketInit, ncclSocketDevices, - ncclSocketPciPath, - ncclSocketPtrSupport, + ncclSocketGetProperties, ncclSocketListen, ncclSocketConnect, ncclSocketAccept, diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index a263408581..3cf1658230 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -47,9 +47,6 @@ struct p2pRecvResources { #include -NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2); -NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2); - /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ int busIdToCudaDev(int64_t busId) { int ndev; @@ -69,96 +66,51 @@ int busIdToCudaDev(int64_t busId) { /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { - int cpuCount; - NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount)); #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - int p2pLevel = PATH_ARRAY_SIZE; -#else - // Do not use P2P across sockets by default (provided CUDA permits it). - // When we are on a single socket, don't even use P2P through the CPU as - // it should be able to sustain two flows to sysmem faster than PCI P2P. - int p2pLevel = cpuCount == 1 ? 
PATH_PHB : PATH_NODE; + if (!hasFineGrainVramPcie()) { + *ret = 0; + return ncclSuccess; + } #endif - if (ncclParamP2pDisable() == 1) p2pLevel = 0; - if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); - - // Disable P2P - *ret = 0; - -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - if (!hasFineGrainVramPcie()) return ncclSuccess; -#endif - - if (p2pLevel == 0) return ncclSuccess; // Rule out different nodes - if (info1->hostHash != info2->hostHash) return ncclSuccess; + if (info1->hostHash != info2->hostHash) { + *ret = 0; + return ncclSuccess; + } + + // Check topology / p2p level. + NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret)); + if (*ret == 0) return ncclSuccess; // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); if (cudaDev1 == -1 || cudaDev2 == -1) { - // Peer's CUDA device is not visible in this process #if CUDART_VERSION >= 10010 - // But in CUDA 10.1 we can still communicate with 'invisible' devices - TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId); - // Check for NVLink/NVswitch including P2P access - int nvlink; - NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink)); - if (nvlink > 0) { - *ret = 1; - return ncclSuccess; - } + // CUDA 10.1 and later can use P2P with invisible devices. + return ncclSuccess; +#else + // Peer's CUDA device is not visible in this process : we can't communicate with it. + *ret = 0; + return ncclSuccess; #endif - return ncclSuccess; } - TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId); - - // Do not detect topology if we're on the same GPU. Note this is not really supported. 
- if (cudaDev1 == cudaDev2) { - *ret = 1; - return ncclSuccess; - } - - // See if CUDA can do P2P + // Check that CUDA can do P2P int p2p; if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) { INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)", cudaDev1, info1->busId, cudaDev2, info2->busId); + *ret = 0; return ncclSuccess; } - if (p2p == 0) return ncclSuccess; - - int nvlink = 0; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - uint32_t link_type, hops; - if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) != hipSuccess) { - p2p = 0; + if (p2p == 0) { + INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)", + cudaDev1, info1->busId, cudaDev2, info2->busId); + *ret = 0; return ncclSuccess; } - static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"}; - static unsigned long long link_status_print_once_mask = 0; - if (!(link_status_print_once_mask & (1 << (cudaDev1*8 + cudaDev2)))) { - INFO(NCCL_INIT, "%d -> %d: link type %s hops %d", cudaDev1, cudaDev2, - link_type_name[link_type], hops); - link_status_print_once_mask |= (1 << (cudaDev1*8 + cudaDev2)); - } - if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) - nvlink = (hops == 1); -#else // Check for NVLink/NVswitch - NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink)); -#endif - if (nvlink > 0) { - *ret = 1; - return ncclSuccess; - } - // Finally compute the PCI distance and compare with the p2pLevel. 
- int distance; - NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance)); - if (distance < p2pLevel) { - *ret = 1; - } return ncclSuccess; } @@ -301,13 +253,13 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra } /* Connect/Send to this peer */ -static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { +static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; struct ncclRecvMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; if (info->direct) { remDevMem = (struct ncclRecvMem*)(info->directPtr); - send->conn.direct = 1; + send->conn.direct |= NCCL_DIRECT_GPU; } else { //TRACE_DUMP_IPC(&info->devIpc); hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess); @@ -339,13 +291,13 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC } /* Connect/Recv from this peer */ -ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { +ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; struct ncclSendMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; if (info->direct) { remDevMem = (struct ncclSendMem*)(info->directPtr); - recv->conn.direct = 1; + recv->conn.direct |= NCCL_DIRECT_GPU; recv->conn.ptrExchange = &remDevMem->ptrExchange; } else { //TRACE_DUMP_IPC(&info->devIpc); diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 60f16c802d..0b1d8eec6d 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -104,7 +104,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra } 
/* Connect to this peer */ -ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { +ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; @@ -129,7 +129,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto return ncclSuccess; } -ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { +ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { // Setup device pointers struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; diff --git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile index e19ded6d27..a2dc9704d6 100644 --- a/tools/topo_expl/Makefile +++ b/tools/topo_expl/Makefile @@ -8,7 +8,8 @@ HIPCC = $(HIP_PATH)/bin/hipcc EXE = topo_expl CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE -files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/graph/search.cc ../../src/graph/connect.cc +files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \ + ../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc all: $(EXE) diff --git a/tools/topo_expl/include/model.h b/tools/topo_expl/include/model.h index 116febad1d..3b02c55ea2 100644 --- a/tools/topo_expl/include/model.h +++ b/tools/topo_expl/include/model.h @@ -23,397 +23,114 @@ THE SOFTWARE. 
#ifndef MODEL_H_ #define MODEL_H_ -class CpuDevices { -private: - char *cpuName; - int interCpuWidth; - int cpuPciWidth; - int p2pPciWidth; - -public: - CpuDevices(const char *cpuname, const int intercpuwidth, const int cpupciwidth, const int p2ppciwidth) : - cpuName((char *)cpuname), interCpuWidth(intercpuwidth), cpuPciWidth(cpupciwidth), p2pPciWidth(p2ppciwidth) {} - - CpuDevices() : cpuName(0), interCpuWidth(0), cpuPciWidth(0), p2pPciWidth(0) {} - - ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) { - strcpy(name, cpuName); - *interCpu = interCpuWidth; - *cpuPci = cpuPciWidth; - *p2pPci = p2pPciWidth; - return ncclSuccess; - } -}; - -class GpuDevices { -private: - int nGpus; - uint64_t *busIds; - char **gpuPciPaths; - int *gpuNumaIds; - int *connMatrix; - -public: - GpuDevices(const int ngpus, const uint64_t *busids, const char **gpupcipaths, const int *gpunumaids, const int *connmatrix) : - nGpus(ngpus), busIds((uint64_t *)busids), gpuPciPaths((char **)gpupcipaths), gpuNumaIds((int *)gpunumaids), connMatrix((int *)connmatrix) {} - - GpuDevices () : nGpus(0), busIds(0), gpuPciPaths(0), gpuNumaIds(0), connMatrix(0) {} - - int getnDevs() { return nGpus; } - - uint64_t getBusId(int dev) { return busIds[dev]; } - - ncclResult_t getPciPath(char* busId, char** path) { - char tempBusId[] = "0000:00:00.0"; - *path = (char *)malloc(PATH_MAX); - int i; - for (i = 0; i < nGpus; i++) { - NCCLCHECK(int64ToBusId(busIds[i], tempBusId)); - if (strcmp(busId, tempBusId) == 0) - break; - } - if (i < nGpus) - strcpy(*path, gpuPciPaths[i]); - else { - WARN("Could not find real path of %s", busId); - return ncclSystemError; - } - return ncclSuccess; - } - - int p2pCanConnect(int device1, int device2) { - // connection matrix are 8 GPUs - int dist = connMatrix[device1*8+device2]; - if (dist == 255) - return 0; - //if (dist%15 == 0 && dist/15 != 1) { - // return 0; - //} - return 1; - }; - - hipError_t getLinkTypeAndHopCount(int device1, int device2, 
uint32_t* linktype, uint32_t* hopcount) { - // connection matrix are 8 GPUs - int dist = connMatrix[device1*8+device2]; - - if (dist%15 == 0) { - *linktype = 4; - *hopcount = dist/15; - } - else if (dist%20 == 0) { - *linktype = 2; - *hopcount = dist/20; - } - else if (dist%36 == 0) { - *linktype = 1; - *hopcount = dist/36; - } - return hipSuccess; - } - - virtual int getNumaId(char *path) { - int n; - // search for all GPUs - for (n = 0; n < nGpus; n++) - if (strcmp(path, gpuPciPaths[n]) == 0) - break; - if (n < nGpus) - return gpuNumaIds[n]; - return -1; - } -}; - -class NetDevices { -private: - int nNetDevs; - char **netPciPaths; - uint64_t *netGuids; // IB ports on same card share the same GUID - int *netNumaIds; - -public: - NetDevices(const int nnetdevs, const char **netpcipaths, const uint64_t *netguids, const int *netnumaids) : - nNetDevs(nnetdevs), netPciPaths((char **)netpcipaths), netGuids((uint64_t *)netguids), netNumaIds((int *)netnumaids) {} - - NetDevices() : nNetDevs(0), netPciPaths(0), netGuids(0), netNumaIds(0) {} - - int getnDevs() { return nNetDevs; } - - ncclResult_t getPciPath(int dev, char** path) { - *path = (char *)malloc(PATH_MAX); - if (dev < nNetDevs) - strcpy(*path, netPciPaths[dev]); - else { - WARN("Could not find real path of %d", dev); - return ncclSystemError; - } - return ncclSuccess; - } - - virtual int getNumaId(char *path) { - int n; - // search for all NICs - for (n = 0; n < nNetDevs; n++) - if (strcmp(path, netPciPaths[n]) == 0) - break; - if (n < nNetDevs) - return netNumaIds[n]; - return -1; - } - - uint64_t getIbGuid(char* path) { - int n; - for (n = 0; n < nNetDevs; n++) - if (strcmp(path, netPciPaths[n]) == 0) - break; - if (n < nNetDevs) - return netGuids[n]; - WARN("Invalid IB path %s", path); - return 0; - } -}; +#include +#include "topo.h" +#include "xml.h" +#include "utils.h" class NodeModel { private: - CpuDevices cpus; - GpuDevices gpus; - NetDevices netdevs; public: - int nodeId; - int currRank; - int firstRank; 
+ std::vector systems; uint64_t hostHash; // auto-generated uint64_t pidHash; // auto-generated - char description[256]; + int nodeId; + int firstRank; + int currRank; - int rankToCudaDev(int rank) { return rank - firstRank; } - - int getnGpus() { return gpus.getnDevs(); } - - int getnNetDevs() { return netdevs.getnDevs(); } - - ncclResult_t getGpuPciPath(char* busId, char** path) { - return gpus.getPciPath(busId, path); + NodeModel(const char *xml_file) { + char filename[PATH_MAX]; + ssize_t count = readlink("/proc/self/exe", filename, PATH_MAX); + while (--count > 0) { + if (filename[count] == '/') { + filename[count+1] = 0; + break; + } + }; + strcat(filename, "models/"); + strcat(filename, xml_file); + struct ncclTopoSystem* system; + ncclTopoGetSystem(filename, &system); + systems.push_back(system); + for (int i=0; inodes[GPU].count; } - uint64_t getGpuBusId(int dev) { - return gpus.getBusId(dev); + int rankToCudaDev(int rank) { + for (int i=0; inodes[GPU].nodes[i].gpu.rank) + return systems[0]->nodes[GPU].nodes[i].gpu.dev; + } + return -1; } - int p2pCanConnect(int device1, int device2) { return gpus.p2pCanConnect(device1, device2); } - - hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) { - return gpus.getLinkTypeAndHopCount(device1, device2, linktype, hopcount); + int64_t getGpuBusId(int rank) { + for (int i=0; inodes[GPU].nodes[i].gpu.rank) + return systems[0]->nodes[GPU].nodes[i].id; + } + return -1; } - uint64_t getIbGuid(char* path) { - return netdevs.getIbGuid(path); + int busIdToCudaDev(int64_t busId) { + for (int i=0; inodes[GPU].nodes[i].id == busId) + return systems[0]->nodes[GPU].nodes[i].gpu.dev; + return -1; } + void setRanks() { + for (int r=0; rnodes[GPU].nodes[i].gpu.rank += firstRank; + } + + int p2pCanConnect(int device1, int device2) { return 1; } int shmCanConnect(int device1, int device2) { return 1; } int netCanConnect(int device1, int device2) { return 1; } - virtual int getNumaId(char 
*path) { - int numa = gpus.getNumaId(path); - if (numa != -1) return numa; - numa = netdevs.getNumaId(path); - if (numa != -1) return numa; - WARN("Invalid path %s for getNumaId", path); - return 0; - } - - virtual ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) { - return cpus.getCpuWidths(name, interCpu, cpuPci, p2pPci); - } - - NodeModel(CpuDevices cpu, GpuDevices gpu, NetDevices net, const char *desc) : - cpus(cpu), gpus(gpu), netdevs(net) { - strncpy(description, desc, 256); - } - - NodeModel() {} - ~NodeModel() {} }; class NetworkModel { private: - int nNodes; int nRanks; - NodeModel nodes[NCCL_TOPO_MAX_NODES]; + std::vector nodes; public: - void AddNode(NodeModel node) { - nodes[nNodes] = node; - nodes[nNodes].nodeId = nNodes; - nodes[nNodes].firstRank = nRanks; - nodes[nNodes].hostHash = ((uint64_t)rand() << 32) | rand(); - nodes[nNodes].pidHash = ((uint64_t)rand() << 32) | rand(); - nNodes++; - nRanks += node.getnGpus(); + void AddNode(NodeModel* node) { + node->nodeId = nodes.size(); + node->firstRank = nRanks; + node->setRanks(); + nRanks += node->getNumGpus(); + nodes.push_back(node); } - int GetNNodes() { return nNodes; } - - int GetNRanks() { return nRanks; } - NodeModel* GetNode(int rank) { - int node_id; - - if(rank < 0 || rank >= nRanks) - return 0; - - for(node_id = nNodes-1; node_id >= 0; node_id--) - if(rank >= nodes[node_id].firstRank) break; - - if (node_id >= 0) { - nodes[node_id].currRank = rank; - return nodes+node_id; + for (auto & node : nodes) { + if (rank >= node->firstRank && rank < node->firstRank+node->getNumGpus()) { + node->currRank = rank; + return node; + } } - else - return 0; + return NULL; } - NetworkModel() : nNodes(0), nRanks(0) {} -}; + int GetNNodes() { return nodes.size(); } + int GetNRanks() { return nRanks; } - -const static uint64_t busIds_8[] = { 0x1d000, 0x20000, 0x23000, 0x26000, 0x3f000, 0x43000, 0x46000, 0x49000 }; - -const static char* gpuPciPaths_8[] = { - 
"/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:08.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0", - "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:0c.0/0000:1e:00.0/0000:1f:00.0/0000:20:00.0", - "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:10.0/0000:21:00.0/0000:22:00.0/0000:23:00.0", - "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:14.0/0000:24:00.0/0000:25:00.0/0000:26:00.0", - "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:04.0/0000:3d:00.0/0000:3e:00.0/0000:3f:00.0", - "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:0c.0/0000:41:00.0/0000:42:00.0/0000:43:00.0", - "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:10.0/0000:44:00.0/0000:45:00.0/0000:46:00.0", - "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:14.0/0000:47:00.0/0000:48:00.0/0000:49:00.0", -}; - -const static int gpuPciNumaIds_8[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - -const static char* netPciPaths_1[] = { - "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0", -}; - -const static char* netPciPaths_1_1[] = { - "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0", -}; - -const static uint64_t netGuids_1[] = { - 0xb8599f030007053aL, -}; - -const static int netPciNumaIds_1[] = { 0 }; - -const static char* netPciPaths_2[] = { - "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0", - "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0", -}; - -const static uint64_t netGuids_2[] = { - 0xb8599f030007053aL, - 0x506b4b030027bbf2L, -}; - -const static int netPciNumaIds_2[] = { 0, 0 }; - -const static uint64_t rome_busIds_8[] = { 0x63000, 0x23000, 0x26000, 0x03000, 0xe3000, 0xc3000, 0xc6000, 0xa3000 }; - -const static char* rome_gpuPciPaths_8[] = { - "/sys/devices/pci0000:60/0000:60:03.1/0000:61:00.0/0000:62:00.0/0000:63:00.0", - "/sys/devices/pci0000:20/0000:20:01.1/0000:21:00.0/0000:22:00.0/0000:23:00.0", - 
"/sys/devices/pci0000:20/0000:20:03.1/0000:24:00.0/0000:25:00.0/0000:26:00.0", - "/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0", - "/sys/devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/0000:e2:00.0/0000:e3:00.0", - "/sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0/0000:c2:00.0/0000:c3:00.0", - "/sys/devices/pci0000:c0/0000:c0:03.1/0000:c4:00.0/0000:c5:00.0/0000:c6:00.0", - "/sys/devices/pci0000:a0/0000:a0:03.1/0000:a1:00.0/0000:a2:00.0/0000:a3:00.0", -}; - -const static int rome_gpuPciNumaIds_8[] = { 0, 0, 0, 0, 4, 4, 4, 4 }; - -const static char* rome_netPciPaths_1[] = { - "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0", -}; - -const static uint64_t rome_netGuids_1[] = { - 0xb8599f030007053aL, -}; - -const static int rom_netPciNumaIds_1[] = { 0 }; - -const static char* rome_netPciPaths_2[] = { - "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0", - "/sys/devices/pci0000:80/0000:80:01.1/0000:81:00.0", -}; - -const static uint64_t rome_netGuids_2[] = { - 0xb8599f030007053aL, - 0x506b4b030027bbf2L, -}; - -const static int rom_netPciNumaIds_2[] = { 0, 4 }; - -const int conn_mat_pcie[64] = { - 0 , 40, 40, 40, 40, 40, 40, 40, - 40, 0 , 40, 40, 40, 40, 40, 40, - 40, 40, 0 , 40, 40, 40, 40, 40, - 40, 40, 40, 0 , 40, 40, 40, 40, - 40, 40, 40, 40, 0 , 40, 40, 40, - 40, 40, 40, 40, 40, 0 , 40, 40, - 40, 40, 40, 40, 40, 40, 0 , 40, - 40, 40, 40, 40, 40, 40, 40, 0 , -}; - -const int conn_mat_4p2h[64] = { - 0 , 15, 15, 30, 40, 40, 40, 40, - 15, 0 , 30, 15, 40, 40, 40, 40, - 15, 30, 0 , 15, 40, 40, 40, 40, - 30, 15, 15, 0 , 40, 40, 40, 40, - 40, 40, 40, 40, 0 , 15, 15, 30, - 40, 40, 40, 40, 15, 0 , 30, 15, - 40, 40, 40, 40, 15, 30, 0 , 15, - 40, 40, 40, 40, 30, 15, 15, 0 , -}; - -const int conn_mat_8p6l[64] = { - 0 , 15, 15, 15, 15, 30, 15, 15, - 15, 0 , 15, 15, 30, 15, 15, 15, - 15, 15, 0 , 15, 15, 15, 15, 30, - 15, 15, 15, 0 , 15, 15, 30, 15, - 15, 30, 15, 15, 0 , 15, 15, 15, - 30, 15, 15, 15, 15, 0 , 15, 15, - 15, 15, 15, 30, 15, 15, 0 , 15, 
- 15, 15, 30, 15, 15, 15, 15, 0 , -}; - -const int conn_mat_8p6l_1[64] = { - 0 , 15, 15, 30, 15, 15, 15, 15, - 15, 0 , 30, 15, 15, 15, 15, 15, - 15, 30, 0 , 15, 15, 15, 15, 15, - 30, 15, 15, 0 , 15, 15, 15, 15, - 15, 15, 15, 15, 0 , 15, 15, 30, - 15, 15, 15, 15, 15, 0 , 30, 15, - 15, 15, 15, 15, 15, 30, 0 , 15, - 15, 15, 15, 15, 30, 15, 15, 0 , -}; - -const int conn_mat_rome[64] = { - 0 , 40, 40, 40, 72, 72, 72, 72, - 40, 0 , 40, 40, 72, 72, 72, 72, - 40, 40, 0 , 40, 72, 72, 72, 72, - 40, 40, 40, 0 , 72, 72, 72, 72, - 72, 72, 72, 72, 0 , 40, 40, 40, - 72, 72, 72, 72, 40, 0 , 40, 40, - 72, 72, 72, 72, 40, 40, 0 , 40, - 72, 72, 72, 72, 40, 40, 40, 0 , + NetworkModel() : nRanks(0) {} }; #endif \ No newline at end of file diff --git a/tools/topo_expl/include/nccl.h b/tools/topo_expl/include/nccl.h index 23ffec5283..225b150127 100644 --- a/tools/topo_expl/include/nccl.h +++ b/tools/topo_expl/include/nccl.h @@ -12,11 +12,11 @@ #include #define NCCL_MAJOR 2 -#define NCCL_MINOR 5 -#define NCCL_PATCH 6 +#define NCCL_MINOR 6 +#define NCCL_PATCH 2 #define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE 2506 +#define NCCL_VERSION_CODE 2602 #define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) #define RCCL_BFLOAT16 1 diff --git a/tools/topo_expl/include/utils.h b/tools/topo_expl/include/utils.h index 26e85c26d6..9337e18aee 100644 --- a/tools/topo_expl/include/utils.h +++ b/tools/topo_expl/include/utils.h @@ -13,32 +13,34 @@ struct allGather1Data_t { struct ncclComm* comm; }; -struct allGather3Data_t { +// AllGather3 - begin +struct ncclGraphInfo { + int sameChannels; + float speedIntra; + float speedInter; + int typeIntra; +}; + +struct allGather3Data_t{ int cudaCompCap; int fullCudaCompCap; - int nvlink; int nChannels; - struct { - int sameChannels; - int speedIntra; - int speedInter; - int nvlink; - } tree; - struct { - int sameChannels; - int speedIntra; - int speedInter; - int nvlink; - } ring; + struct ncclGraphInfo tree; + struct ncclGraphInfo ring; + struct ncclGraphInfo 
collNet; struct ncclTopoRanks topoRanks; }; +ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system); + +ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem); + ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data); -ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, - struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph); +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph); ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, - struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph); + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph); #endif \ No newline at end of file diff --git a/tools/topo_expl/model.cpp b/tools/topo_expl/model.cpp index 9492526dc9..db5eb881bf 100644 --- a/tools/topo_expl/model.cpp +++ b/tools/topo_expl/model.cpp @@ -41,24 +41,14 @@ THE SOFTWARE. #include #include #include "model.h" +#include "topo.h" extern NodeModel *node_model; -static ncclResult_t dummyNetDevices(int* ndev) { - *ndev = node_model->getnNetDevs(); - return ncclSuccess; -} - -static ncclResult_t dummyNetPciPath(int dev, char** path) { - node_model->getNetPciPath(dev, path); - return ncclSuccess; -} - ncclNet_t ncclNetDummy = { "IB", 0, - dummyNetDevices, - dummyNetPciPath, + 0, 0, 0, 0, @@ -76,24 +66,9 @@ ncclNet_t ncclNetDummy = { ncclNet_t* ncclNet = &ncclNetDummy; -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { - return ncclSuccess; -} - /* Convert a PCI busId string into a local cudaDev device index (cf. 
CUDA_VISIBLE_DEVICES) */ int busIdToCudaDev(int64_t busId) { - int cudaDev; - - for (cudaDev = 0; cudaDev < node_model->getnGpus(); cudaDev++) { - if (node_model->getGpuBusId(cudaDev) == busId) - break; - } - - if (cudaDev < node_model->getnGpus()) - return cudaDev; - else - WARN("Invalid busId %lx", busId); - return 0; + return node_model->busIdToCudaDev(busId); } /* Determine if two peers can communicate with P2P */ @@ -177,6 +152,8 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra int netDev, useGdr = 0; NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &netDev)); + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, netDev, 1, &useGdr)); + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev, useGdr ? "/GDRDMA" : ""); return ncclSuccess; @@ -188,15 +165,8 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra int netDev, useGdr = 0; NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &netDev)); - // Check if we are close enough that it makes sense to enable GDR - int netGdrLevel = ncclParamNetGdrLevel(); - int distance; - NCCLCHECK(ncclTopoNetDistance(topo, myInfo->busId, netDev, &distance)); - if (distance >= netGdrLevel) { - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), myInfo->busId, netDev, distance, netGdrLevel); - } - else - useGdr = 1; + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, netDev, 0, &useGdr)); + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev, useGdr ? 
"/GDRDMA" : ""); return ncclSuccess; diff --git a/tools/topo_expl/models/topo_4p1h.xml b/tools/topo_expl/models/topo_4p1h.xml new file mode 100644 index 0000000000..6878edf115 --- /dev/null +++ b/tools/topo_expl/models/topo_4p1h.xml @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_4p1h_1.xml b/tools/topo_expl/models/topo_4p1h_1.xml new file mode 100644 index 0000000000..bc7dcaa606 --- /dev/null +++ b/tools/topo_expl/models/topo_4p1h_1.xml @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_4p2h.xml b/tools/topo_expl/models/topo_4p2h.xml new file mode 100644 index 0000000000..03ce1dff5c --- /dev/null +++ b/tools/topo_expl/models/topo_4p2h.xml @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_4p2h_1.xml b/tools/topo_expl/models/topo_4p2h_1.xml new file mode 100644 index 0000000000..fb2dea7d9b --- /dev/null +++ b/tools/topo_expl/models/topo_4p2h_1.xml @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_4p2h_2nic.xml b/tools/topo_expl/models/topo_4p2h_2nic.xml new file mode 100644 index 0000000000..9ec580f726 --- /dev/null +++ b/tools/topo_expl/models/topo_4p2h_2nic.xml @@ -0,0 +1,82 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_4p3l.xml b/tools/topo_expl/models/topo_4p3l.xml new file mode 100644 index 0000000000..0549d11388 --- /dev/null +++ b/tools/topo_expl/models/topo_4p3l.xml @@ -0,0 +1,52 @@ + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_8p6l.xml b/tools/topo_expl/models/topo_8p6l.xml new file mode 100644 index 0000000000..e54ac50a1b --- /dev/null +++ b/tools/topo_expl/models/topo_8p6l.xml @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_8p_pcie.xml b/tools/topo_expl/models/topo_8p_pcie.xml new file mode 100644 index 0000000000..521d1ab695 --- /dev/null +++ b/tools/topo_expl/models/topo_8p_pcie.xml @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_8p_pcie_1.xml b/tools/topo_expl/models/topo_8p_pcie_1.xml new file mode 100644 index 0000000000..350f11f816 --- /dev/null +++ b/tools/topo_expl/models/topo_8p_pcie_1.xml @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_8p_pcie_2nic.xml b/tools/topo_expl/models/topo_8p_pcie_2nic.xml new file mode 100644 index 0000000000..dae7b1c0f1 --- /dev/null +++ b/tools/topo_expl/models/topo_8p_pcie_2nic.xml @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp index 98f175cccc..4f2df2ef4e 100644 --- a/tools/topo_expl/topo_expl.cpp +++ b/tools/topo_expl/topo_expl.cpp @@ -29,7 +29,6 @@ THE SOFTWARE. 
#include "net.h" #include "graph.h" #include "argcheck.h" -#include "cpuset.h" #include #include #include @@ -65,18 +64,18 @@ bool cmdOptionExists(char** begin, char** end, const std::string& option) { const char *model_descriptions[] = { "single node VEGA20 4P1H", + "single node VEGA20 4P1H Alt. Model", "single node VEGA20 4P2H", "single node gfx908 4P3L", "single node gfx908 8P6L", - "single node gfx908 8P6L Alt. Connection", - "single node 8 VEGA20 PCIe on Rome", - "single node gfx908 8P6L on Rome", + "single node 8 VEGA20 PCIe", "4 nodes with 8 GPUs PCIe 1 NIC", + "4 nodes with 8 GPUs PCIe 1 NIC 2nd PLX Bridge", "4 nodes with 8 GPUs PCIe 2 NIC", "2 nodes VEGA20 4P1H", "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC", - "4 nodes 8 GPUs PCIe 2 NICs on Rome", - "3 nodes 8 GPUs PCIe + 1 Rome 8 GPUs PCIe + 2 nodes gfx908 4P3L", + "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC 2nd Hive", + "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 2 NIC", NULL, }; @@ -97,97 +96,75 @@ int main(int argc,char* argv[]) if (mi) model_id = atol(mi); - // CPU, GPU and NIC devices on Skylake - CpuDevices skylake("Skylake", SKL_QPI_WIDTH, SKL_CPUPCI_WIDTH, SKL_PCI_WIDTH); - GpuDevices vg20_pcie(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_pcie); - GpuDevices vg20_4p1h(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h); - GpuDevices vg20_4p2h(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h); - GpuDevices gfx908_4p3l(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l); - GpuDevices gfx908_8p6l(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l); - GpuDevices gfx908_8p6l_1(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l_1); - NetDevices nic_1(1, netPciPaths_1, netGuids_1, netPciNumaIds_1); - NetDevices nic_1_1(1, netPciPaths_1_1, netGuids_1, netPciNumaIds_1); - NetDevices nic_2(2, netPciPaths_2, netGuids_2, netPciNumaIds_2); - - // CPU, GPU and NIC devices on Rome - CpuDevices rome("Rome", ROME_QPI_WIDTH, ROME_CPUPCI_WIDTH, ROME_PCI_WIDTH); - 
GpuDevices vg20_pcie_rome(8, rome_busIds_8, rome_gpuPciPaths_8, rome_gpuPciNumaIds_8, conn_mat_rome); - NetDevices nic_1_rome(1, rome_netPciPaths_1, rome_netGuids_1, rom_netPciNumaIds_1); - NetDevices nic_2_rome(2, rome_netPciPaths_2, rome_netGuids_2, rom_netPciNumaIds_2); - - // 8 GPUs PCIe 1 NIC - NodeModel model_8pcie_1nic(skylake, vg20_pcie, nic_1, "Skylake 8 GPUs PCIe"); - - // 8 GPUs PCIe 2 NIC - NodeModel model_8pcie_2nic(skylake, vg20_pcie, nic_2, "Skylake 8 GPUs PCIe 2 NIC"); - - // VEGA20 4P1H, use VEGA20 4P2H model - NodeModel model_vg20_4p1h_1nic(skylake, vg20_4p1h, nic_1, "Skylake VEGA20 4P1H"); - - // VEGA20 GPUs XGMI 4P2H 1 NIC - NodeModel model_vg20_4p2h_1nic(skylake, vg20_4p2h, nic_1_1, "Skylake VEGA20 4P2H"); - - // gfx908 4P3L - NodeModel model_gfx908_4p_1nic(skylake, gfx908_4p3l, nic_1, "Skylake gfx908 4P3L"); - - // gfx908 8P6L - NodeModel model_gfx908_8p_1nic(skylake, gfx908_8p6l, nic_1, "Skylake gfx908 8P6L"); - - // gfx908 8P6L alternative connection - NodeModel model_gfx908_8p_1nic_1(skylake, gfx908_8p6l_1, nic_1, "Skylake gfx908 8P6L Alt. 
Connection"); - - // 8 GPUs PCIe on Rome - NodeModel model_8pcie_1nic_rome(rome, vg20_pcie_rome, nic_1_rome, "Rome 8 GPUs PCIe"); - - // 8 GPUs PCIe 2 NICs on Rome - NodeModel model_8pcie_2nic_rome(rome, vg20_pcie_rome, nic_2_rome, "Rome 8 GPUs PCIe 2 NICs"); - - // gfx908 8P6L on Rome - NodeModel model_gfx908_8p_1nic_rome(rome, gfx908_8p6l, nic_1, "Rome gfx908 8P6L"); - NetworkModel network; + NodeModel* node; switch(model_id) { case 0: - network.AddNode(model_vg20_4p1h_1nic); + node = new NodeModel("topo_4p1h.xml"); + network.AddNode(node); break; case 1: - network.AddNode(model_vg20_4p2h_1nic); + node = new NodeModel("topo_4p1h_1.xml"); + network.AddNode(node); break; case 2: - network.AddNode(model_gfx908_4p_1nic); + node = new NodeModel("topo_4p2h.xml"); + network.AddNode(node); break; case 3: - network.AddNode(model_gfx908_8p_1nic); + node = new NodeModel("topo_4p3l.xml"); + network.AddNode(node); break; case 4: - network.AddNode(model_gfx908_8p_1nic_1); + node = new NodeModel("topo_8p6l.xml"); + network.AddNode(node); break; case 5: - network.AddNode(model_8pcie_1nic_rome); + node = new NodeModel("topo_8p_pcie.xml"); + network.AddNode(node); break; case 6: - network.AddNode(model_gfx908_8p_1nic_rome); + for (int i=0; i<4; i++) { + node = new NodeModel("topo_8p_pcie.xml"); + network.AddNode(node); + } break; case 7: - for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_1nic); + for (int i=0; i<4; i++) { + node = new NodeModel("topo_8p_pcie_1.xml"); + network.AddNode(node); + } break; case 8: - for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic); + for (int i=0; i<4; i++) { + node = new NodeModel("topo_8p_pcie_2nic.xml"); + network.AddNode(node); + } break; case 9: - for (int i = 0; i < 2; i ++) network.AddNode(model_vg20_4p1h_1nic); + for (int i=0; i<2; i++) { + node = new NodeModel("topo_4p1h.xml"); + network.AddNode(node); + } break; case 10: - for (int i = 0; i < 4; i ++) network.AddNode(model_vg20_4p2h_1nic); + for (int i=0; i<4; i++) { + 
node = new NodeModel("topo_4p2h.xml"); + network.AddNode(node); + } break; case 11: - for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic_rome); + for (int i=0; i<4; i++) { + node = new NodeModel("topo_4p2h_1.xml"); + network.AddNode(node); + } break; case 12: - for (int i = 0; i < 3; i ++) network.AddNode(model_8pcie_1nic); - network.AddNode(model_8pcie_1nic_rome); - for (int i = 0; i < 2; i ++) network.AddNode(model_gfx908_4p_1nic); + for (int i=0; i<4; i++) { + node = new NodeModel("topo_4p2h_2nic.xml"); + network.AddNode(node); + } break; default: printf("Invalid model_id %d\n", model_id); @@ -203,8 +180,8 @@ int main(int argc,char* argv[]) for (int i = 0; i < nranks; i++) { node_model = network.GetNode(i); assert(node_model!=0); - printf("Rank %d: node %d (%s) GPU busId %lx\n", i, node_model->nodeId, - node_model->description, node_model->getGpuBusId(node_model->rankToCudaDev(i))); + printf("Rank %d: node %d cudaDev %d GPU busId %lx\n", i, node_model->nodeId, + node_model->rankToCudaDev(i), node_model->getGpuBusId(i)); } NCCLCHECK(ncclCalloc(&comm, nranks)); @@ -220,21 +197,22 @@ int main(int argc,char* argv[]) comm[i].nRanks = nranks; node_model = network.GetNode(i); assert(node_model!=0); + comm[i].topo = node_model->getSystem(i); bootstrapAllGather(&comm[i], allGather1Data); } - struct ncclTopoGraph treeGraph, ringGraph; + struct ncclTopoGraph treeGraph, ringGraph, collNetGraph; for (int i = 0; i < nranks; i++) { node_model = network.GetNode(i); assert(node_model!=0); - initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph); + initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph, collNetGraph); } for (int i = 0; i < nranks; i++) { node_model = network.GetNode(i); assert(node_model!=0); - initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph); + initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph, collNetGraph); } free(allGather3Data); diff --git 
a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 18d61d0b99..942b4a8a66 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -25,12 +25,19 @@ #include #include #include +#include "xml.h" +#include "coll_net.h" #include "model.h" #include "utils.h" extern NodeModel *node_model; NCCL_PARAM(CrossNic, "CROSS_NIC", 2); +NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); +NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); + +thread_local int ncclDebugNoWarn = 0; +ncclCollNet_t* ncclCollNet = NULL; // Get current Compute Capability int ncclCudaCompCap() { @@ -43,7 +50,7 @@ ncclResult_t int64ToBusId(int64_t id, char* busId) { return ncclSuccess; } -ncclResult_t busIdToInt64(char* busId, int64_t* id) { +ncclResult_t busIdToInt64(const char* busId, int64_t* id) { const int size = strlen(busId); char* hexStr; NCCLCHECK(ncclCalloc(&hexStr, size)); @@ -87,9 +94,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file if (ncclDebugLevel == -1) ncclDebugInit(); if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return; char buffer[1024]; - size_t len; - len = snprintf(buffer, sizeof(buffer), - "[%d:%d] ", node_model->nodeId, node_model->currRank); + size_t len = 0; + if (node_model) len = snprintf(buffer, sizeof(buffer), + "[%d:%d] ", node_model->nodeId, node_model->currRank); va_list args; va_start(args, fmt); vsprintf(buffer+len, fmt, args); @@ -102,6 +109,16 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } } +ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system) { + struct ncclXml* xml; + NCCLCHECK(ncclCalloc(&xml, 1)); + NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml)); + NCCLCHECK(ncclTopoGetSystemFromXml(xml, system)); + free(xml); + return ncclSuccess; +} + + ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) { // AllGather1 - begin allGather1Data[comm->rank].peerInfo.rank = 
comm->rank; @@ -110,12 +127,12 @@ ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data[comm->rank].peerInfo.hostHash = node_model->hostHash; allGather1Data[comm->rank].peerInfo.pidHash = node_model->pidHash; allGather1Data[comm->rank].peerInfo.shmDev = 0x19; - allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(node_model->rankToCudaDev(comm->rank)); + allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(comm->rank); return ncclSuccess; } -ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, - struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph) { +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { // We use 3 AllGathers // 1. { peerInfo, comm } // 2. 
ConnectTransport[nranks], ConnectValue[nranks] @@ -147,45 +164,70 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t // AllGather1 - end // Topo detection / System graph creation - NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); + //NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); // Compute paths between GPUs and NICs NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); // Remove inaccessible GPUs and unused NICs NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); // Recompute paths after trimming NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); - // Compute max speed to accelerate search - NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo)); + // Init search + NCCLCHECK(ncclTopoSearchInit(comm->topo)); // Print final topology NCCLCHECK(ncclTopoPrint(comm->topo)); // Get rings and trees - //struct ncclTopoGraph treeGraph; - treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; - treeGraph.crossNic = ncclParamCrossNic(); - // We communicate only half the data between node with trees on 2 nodes. - NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); - NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); //struct ncclTopoGraph ringGraph; + ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; ringGraph.crossNic = ncclParamCrossNic(); + ringGraph.collNet = 0; + ringGraph.minChannels = 1; + ringGraph.maxChannels = MAXCHANNELS/2; NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph)); + //struct ncclTopoGraph treeGraph; + treeGraph.id = 1; + treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; + treeGraph.crossNic = ncclParamCrossNic(); + treeGraph.collNet = 0; + treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 
1 : ringGraph.nChannels; + treeGraph.maxChannels = ringGraph.nChannels; + NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); + + //struct ncclTopoGraph collNetGraph; + collNetGraph.id = 2; + collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; + collNetGraph.collNet = 1; + collNetGraph.crossNic = ncclParamCrossNic(); + collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; + NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); + + if (comm->rank == ncclParamGraphDumpFileRank()) { + struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; + NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); + } + // AllGather3 - begin allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); - allGather3Data[rank].nvlink = treeGraph.nvlink; allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; allGather3Data[rank].tree.speedInter = treeGraph.speedInter; - allGather3Data[rank].tree.nvlink = treeGraph.nvlink; + allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; allGather3Data[rank].ring.speedInter = ringGraph.speedInter; - allGather3Data[rank].ring.nvlink = ringGraph.nvlink; + allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; + allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; + allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra; + allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter; + allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; - NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); + NCCLCHECK(ncclTopoPreset(comm, 
&treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks)); //INFO(NCCL_GRAPH, "%d: nvlink %d nChannels %d tree.sameChannels %d tree.speedIntra %d tree.speedInter %d tree.nvlink %d ring.sameChannels %d ring.speedIntra %d ring.speedInter %d ring.nvlink %d", // rank, allGather3Data[rank].nvlink, allGather3Data[rank].nChannels, allGather3Data[rank].tree.sameChannels, allGather3Data[rank].tree.speedIntra, allGather3Data[rank].tree.speedInter, allGather3Data[rank].tree.nvlink, // allGather3Data[rank].ring.sameChannels, allGather3Data[rank].ring.speedIntra, allGather3Data[rank].ring.speedInter, allGather3Data[rank].ring.nvlink); @@ -203,14 +245,8 @@ static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopo int ret = 0; NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo)); if (ret) { - //cpu_set_t affinitySave; - //sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - //int cudaDev; - //CUDACHECK(hipGetDevice(&cudaDev)); - //setCpuAffinity(cudaDev); connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId)); - //sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); return ncclSuccess; } } @@ -265,21 +301,26 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, return ncclSuccess; } + ncclResult_t initChannel(struct ncclComm* comm, int channelid) { struct ncclChannel* channel = comm->channels+channelid; channel->id = channelid; // Setup intermediate buffering - //channel->buffSize = ncclParamBuffsize(); + //int buffSize = ncclParamBuffsize(); + int cpuArch, cpuVendor, cpuModel; + NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); + //channel->buffSize = buffSize != -2 ? buffSize : + // cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES; // Ring index to user rank table. 
//NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); // Communication structures with peers. - //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks)); - NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks)); - for (size_t i=0; inRanks; ++i) { + //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) + NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); + for (size_t i=0; inRanks+1; ++i) { channel->peers[i].send.comm = comm; channel->peers[i].recv.comm = comm; } @@ -307,8 +348,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, return ncclSuccess; } -ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, - struct ncclTopoGraph& ringGraph) { +ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { int rank = comm->rank; int nranks = comm->nRanks; //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); @@ -329,6 +370,15 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t if (i == comm->rank) comm->node = node; } + char line[1024]; + sprintf(line, "nodesFirstRank: "); + int offset = strlen(line); + for (int i=0; inNodes; i++) { + sprintf(line+offset, "%d ", nodesFirstRank[i]); + offset = strlen(line); + } + INFO(NCCL_INIT, "%s", line); + // Determine the minimum CUDA Compute capability of all GPUs int myCompCap = allGather3Data[rank].cudaCompCap; int minCompCap = myCompCap, maxCompCap = myCompCap; @@ -337,9 +387,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap); } - comm->nvlink = 1; - for (int i 
= 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink; - int nChannelsOrig = comm->nChannels; struct ncclTopoRanks** allTopoRanks; NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); @@ -350,11 +397,15 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); - treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink); + treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); - ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink); + ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); + collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); + collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); + collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); + collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); } if (comm->nChannels < nChannelsOrig) { @@ -366,24 +417,23 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - char line[1024]; - sprintf(line, "nodesFirstRank: "); - int offset = strlen(line); - for (int i=0; inNodes; i++) { - sprintf(line+offset, "%d ", nodesFirstRank[i]); - offset = strlen(line); - } - 
INFO(NCCL_INIT, "%s", line); - NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings)); + if (comm->nNodes > 1 && + ncclParamCollNetEnable() == 1 && + collNetSupport()) { + NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank)); + } free(allTopoRanks); free(nodesFirstRank); + //free(allGather3Data); // AllGather3 - end TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); + NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + line[0]='\0'; for (int c=0; cnChannels; c++) { struct ncclTree* treeUp = &comm->channels[c].treeUp; @@ -395,26 +445,56 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); - free(rings); - - // Done with AllGather1 data - //free(allGather1Data); - - TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); + // Set Affinity to a CPU local the our GPU, so that all memory we allocate + // on the host is local. 
+ cpu_set_t affinitySave; + sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank)); + ncclResult_t ret; // Connect with prev/next for each ring struct ncclConnect *connect; - NCCLCHECK(ncclCalloc(&connect, 2)); + NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore); for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; - NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks)); + NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; - NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); - NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up)); - NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down)); + NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore); + NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore); + NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore); } + + // Check if we can setup CollNet +#if 0 + if (comm->nNodes > 1 && + ncclParamCollNetEnable() == 1 && + collNetSupport()) { + int logicChannels = comm->nChannels/2; + int collNetSetupFail = 0; + const int recvIndex = 0; // recv GPU index is always 0 + const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 
0 : 1; // send GPU index depends on topo pattern + for (int c=0; cchannels+logicChannels+c; + struct ncclChannel* channelSend = comm->channels+c; + NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down)); + NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up)); + const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex]; + const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex]; + if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1) + collNetSetupFail = 1; + if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1) + collNetSetupFail = 1; + } + // Verify CollNet setup across ranks + NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail)); + } +#endif TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); free(connect); + free(rings); + +affinity_restore: + if (ret != ncclSuccess) return ret; return ncclSuccess; }