Merge remote-tracking branch 'nccl/master' into develop
Tento commit je obsažen v:
@@ -90,6 +90,7 @@ set(CU_SOURCES
|
||||
src/collectives/device/reduce.cu
|
||||
src/collectives/device/broadcast.cu
|
||||
src/collectives/device/reduce_scatter.cu
|
||||
src/collectives/device/sendrecv.cu
|
||||
src/collectives/device/functions.cu)
|
||||
|
||||
set(CPP_SOURCES)
|
||||
@@ -117,6 +118,7 @@ set(CC_SOURCES
|
||||
src/collectives/reduce_api.cc
|
||||
src/collectives/broadcast_api.cc
|
||||
src/collectives/reduce_scatter_api.cc
|
||||
src/collectives/sendrecv_api.cc
|
||||
src/channel.cc
|
||||
src/misc/argcheck.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
@@ -133,6 +135,7 @@ set(CC_SOURCES
|
||||
src/debug.cc
|
||||
src/group.cc
|
||||
src/bootstrap.cc
|
||||
src/proxy.cc
|
||||
src/enqueue.cc)
|
||||
|
||||
foreach(filename ${CC_SOURCES})
|
||||
|
||||
@@ -23,19 +23,24 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
|
||||
|
||||
|
||||
# Better define NVCC_GENCODE in your environment to the minimal set
|
||||
# You should define NVCC_GENCODE in your environment to the minimal set
|
||||
# of archs to reduce compile time.
|
||||
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
|
||||
-gencode=arch=compute_50,code=sm_50 \
|
||||
-gencode=arch=compute_60,code=sm_60 \
|
||||
-gencode=arch=compute_61,code=sm_61
|
||||
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
|
||||
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
|
||||
|
||||
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
||||
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
||||
CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
|
||||
|
||||
# Include Ampere support if we're using CUDA11 or above
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX)
|
||||
# Include Volta support if we're using CUDA9 or above
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
|
||||
else
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 6
|
||||
NCCL_PATCH := 4
|
||||
NCCL_MINOR := 7
|
||||
NCCL_PATCH := 3
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+2
-2
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
|
||||
|
||||
##### src files
|
||||
INCEXPORTS := nccl.h nccl_net.h
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
|
||||
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
|
||||
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
|
||||
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
|
||||
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
|
||||
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
|
||||
|
||||
##### lib files
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -240,6 +240,7 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
|
||||
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
||||
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return ncclInvalidArgument;
|
||||
|
||||
+4
-15
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -7,24 +7,12 @@
|
||||
|
||||
#include "channel.h"
|
||||
#include "param.h"
|
||||
#include "graph.h"
|
||||
|
||||
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
|
||||
#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
|
||||
|
||||
NCCL_PARAM(Buffsize, "BUFFSIZE", -2);
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
|
||||
struct ncclChannel* channel = comm->channels+channelid;
|
||||
if (channel->id != -1) return ncclSuccess;
|
||||
channel->id = channelid;
|
||||
|
||||
// Setup intermediate buffering
|
||||
int buffSize = ncclParamBuffsize();
|
||||
int cpuArch, cpuVendor, cpuModel;
|
||||
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
|
||||
channel->buffSize = buffSize != -2 ? buffSize :
|
||||
cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
|
||||
|
||||
// Ring index to user rank table.
|
||||
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
|
||||
@@ -38,11 +26,12 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
|
||||
}
|
||||
|
||||
// Per-channel operation list.
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
||||
if (channel->id == -1) return ncclSuccess;
|
||||
// Operation list
|
||||
NCCLCHECK(ncclCudaHostFree(channel->collectives));
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
|
||||
BUILDDIR ?= $(abspath ../../../build)
|
||||
OBJDIR := $(BUILDDIR)/obj/collectives/device
|
||||
|
||||
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
|
||||
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu
|
||||
|
||||
LIBSRCFILES += functions.cu
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -13,26 +13,27 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
@@ -80,27 +81,27 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->lastChunkSize;
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
@@ -148,29 +149,28 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -13,16 +13,17 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws, wr;
|
||||
@@ -30,14 +31,14 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
||||
#endif
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
|
||||
ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
|
||||
|
||||
@@ -106,29 +107,30 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
const ssize_t size = args->N;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
int chunkSize = args->lastChunkSize;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
int chunkSize = args->coll.lastChunkSize;
|
||||
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeUp;
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclPrimitivesRecvData<T, NCCL_MAX_TREE_ARITY> recvData;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount, recvData);
|
||||
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount, recvData);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -147,17 +149,17 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
|
||||
struct ncclTree* tree = &channel->treeDn;
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclPrimitivesSendData<T, NCCL_MAX_TREE_ARITY> sendData;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount, sendData);
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm, args->opCount, sendData);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
prims.send(thisOutput+offset, nelem);
|
||||
prims.directSend(thisOutput+offset, offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
} else {
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
@@ -167,27 +169,28 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
const ssize_t size = args->N;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
int chunkSize = args->lastChunkSize;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
int chunkSize = args->coll.lastChunkSize;
|
||||
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
|
||||
if (blockIdx.x < nChannels) { // first half of the channels do reduce
|
||||
struct ncclTree* tree = &channel->collTreeUp;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -202,9 +205,9 @@ __device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
}
|
||||
|
||||
if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
|
||||
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
|
||||
struct ncclTree* tree = &channel->collTreeDn;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -224,28 +227,27 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*nranks*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
@@ -254,7 +256,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
|
||||
// step 0: push data to next GPU
|
||||
chunk = ring->devUserRanks[nranks-1];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
@@ -262,7 +264,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
@@ -271,7 +273,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
chunk = ring->devUserRanks[0];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
@@ -279,7 +281,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
@@ -287,7 +289,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = ring->devUserRanks[1];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
@@ -299,28 +301,30 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
do {
|
||||
struct ncclTree* tree = &channel->treeUp;
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclLLPrimitivesRecvData<T, NCCL_MAX_TREE_ARITY> recvData;
|
||||
ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount, recvData);
|
||||
ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm, args->opCount, recvData);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -339,7 +343,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
||||
struct ncclTree* tree = &channel->treeDn;
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclLLPrimitivesSendData<T, NCCL_MAX_TREE_ARITY> sendData;
|
||||
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount, sendData);
|
||||
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm, args->opCount, sendData);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -359,26 +363,28 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
|
||||
if (blockIdx.x < nChannels) { // first half of the channels do reduce
|
||||
struct ncclTree* tree = &channel->collTreeUp;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -393,9 +399,9 @@ __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
}
|
||||
|
||||
if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
|
||||
if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
|
||||
struct ncclTree* tree = &channel->collTreeDn;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -416,29 +422,28 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*nranks*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
@@ -447,7 +452,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
|
||||
// step 0: push data to next GPU
|
||||
chunk = ring->devUserRanks[nranks-1];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
@@ -455,7 +460,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
@@ -464,7 +469,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
chunk = ring->devUserRanks[0];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
@@ -472,7 +477,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
chunk = ring->devUserRanks[nranks-j];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
@@ -480,7 +485,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = ring->devUserRanks[1];
|
||||
offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
|
||||
offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
@@ -492,29 +497,31 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* treeUp = &channel->treeUp;
|
||||
struct ncclTree* treeDn = &channel->treeDn;
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = args->lastChunkSize;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = args->coll.lastChunkSize;
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
if (loopSize > size) {
|
||||
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
|
||||
chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
|
||||
}
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
if (treeUp->up == -1) {
|
||||
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
@@ -523,7 +530,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
|
||||
} else {
|
||||
if (tid < nthreadsSplit) {
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
|
||||
ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
@@ -536,7 +543,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
} else {
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
|
||||
ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -13,18 +13,19 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->root;
|
||||
const int root = args->coll.root;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws, wr;
|
||||
@@ -32,14 +33,14 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
||||
#endif
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
@@ -81,29 +82,29 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->root;
|
||||
const int root = args->coll.root;
|
||||
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->lastChunkSize;
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
@@ -135,30 +136,29 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->root;
|
||||
const int root = args->coll.root;
|
||||
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -95,7 +95,8 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
NCCL_FUNCS2A(ncclAllReduce), \
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
using ncclKernelFunc_t = void (*)(struct CollectiveArgs*);
|
||||
@@ -109,7 +110,8 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
NCCL_FUNCS2A(ncclAllReduce),
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8)
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -156,7 +158,8 @@ void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
|
||||
else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
|
||||
else ncclAllGatherCollNet_copy_i8(&c->args);
|
||||
}
|
||||
else Caller<1080, 1800>::call(c);
|
||||
else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
|
||||
else ncclSendRecv_copy_i8(&c->args);
|
||||
}
|
||||
|
||||
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
|
||||
@@ -233,13 +236,13 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
|
||||
\
|
||||
struct ncclChannel* channel = comm->channels+bid; \
|
||||
channel->sync = sync; \
|
||||
if (!load_coll(&localColl, channel->devCollectives+channel->collFifoHead, tid, comm, &abortCount)) { \
|
||||
if (!load_coll(&localColl, channel->collectives+channel->collFifoHead, tid, comm, &abortCount)) { \
|
||||
if (tid == 0) traceAbort(-1); \
|
||||
return; \
|
||||
} \
|
||||
if (tid == 0) traceKernelLaunch(localColl.funcIndex); \
|
||||
while (1) { \
|
||||
if (tid < localColl.args.nThreads) { \
|
||||
if (tid < localColl.args.common.nThreads) { \
|
||||
if (localColl.funcIndex == fIndex) { \
|
||||
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&localColl.args); \
|
||||
} else { \
|
||||
@@ -255,7 +258,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
|
||||
} \
|
||||
\
|
||||
/* Load next collective operation*/ \
|
||||
if (!load_coll(&localColl, channel->devCollectives+nextIndex, tid, comm, &abortCount)) { \
|
||||
if (!load_coll(&localColl, channel->collectives+nextIndex, tid, comm, &abortCount)) { \
|
||||
if (tid == 0) traceAbort(-1); \
|
||||
break; \
|
||||
} \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -59,6 +59,7 @@ NCCL_FUNC5(coll, op, dtype) \
|
||||
|
||||
// Must be consistent with ncclFunc_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
@@ -66,11 +67,12 @@ NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
NCCL_COLL_NAME(ncclSendRecv, copy, i8),
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -9,7 +9,7 @@ dir=$1
|
||||
|
||||
targets="GENOBJS := \\\\\n"
|
||||
|
||||
for base in all_reduce all_gather broadcast reduce reduce_scatter; do
|
||||
for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
|
||||
opn=0
|
||||
for op in sum prod min max; do
|
||||
dtn=0
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -62,13 +62,13 @@ public:
|
||||
|
||||
uint64_t sendStep[NSEND];
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
const T* sendDirectBuff[NRECV];
|
||||
const T* sendDirectBuff[NSEND];
|
||||
#endif
|
||||
T* sendBuff[NSEND];
|
||||
};
|
||||
|
||||
// Implementation of primitive types
|
||||
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
|
||||
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
|
||||
class ncclPrimitives {
|
||||
private:
|
||||
const int tid;
|
||||
@@ -94,7 +94,15 @@ class ncclPrimitives {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void subBarrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -278,12 +286,12 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
r.recvBuff[i] = (const T*)LOAD(&conn->buff);
|
||||
r.recvBuff[i] = (const T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
|
||||
r.recvStep[i] = LOAD(&conn->step);
|
||||
r.recvStep[i] = ROUNDUP(r.recvStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
r.recvDirectBuff[i] = NULL;
|
||||
if (directBuff && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
|
||||
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
|
||||
r.recvDirectBuff[i] = directBuff;
|
||||
if (tid == 0) STORE(conn->ptrExchange, directBuff);
|
||||
}
|
||||
@@ -307,13 +315,13 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
|
||||
s.sendBuff[i] = (T*)LOAD(&conn->buff);
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
|
||||
s.sendBuff[i] = (T*)LOAD(conn->buffs+NCCL_PROTO_SIMPLE);
|
||||
s.sendStep[i] = LOAD(&conn->step);
|
||||
s.sendStep[i] = ROUNDUP(s.sendStep[i], SLICESPERCHUNK*SLICESTEPS);
|
||||
#if defined(RCCL_USE_DIRECT_BUFFER)
|
||||
s.sendDirectBuff[i] = NULL;
|
||||
if (directBuff && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
|
||||
if (DIRECT && LOAD((&conn->direct) & NCCL_DIRECT_GPU)) {
|
||||
void* volatile* ptr = LOAD(&conn->ptrExchange);
|
||||
while ((s.sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
|
||||
barrier();
|
||||
@@ -357,7 +365,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
|
||||
barrier();
|
||||
|
||||
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
|
||||
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
|
||||
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
|
||||
loadRecvSync();
|
||||
loadSendSync();
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -33,6 +33,7 @@ class ncclLLPrimitives {
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
const int wid;
|
||||
const int stepLines;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
struct ncclDevComm* comm;
|
||||
@@ -42,8 +43,8 @@ class ncclLLPrimitives {
|
||||
typename std::conditional<NSEND == NCCL_MAX_TREE_ARITY,
|
||||
ncclLLPrimitivesSendData<T, NSEND>&, ncclLLPrimitivesSendData<T, NSEND>>::type s;
|
||||
|
||||
inline __device__ int recvOffset(int i) { return (r.recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||
inline __device__ int sendOffset(int i) { return (s.sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
|
||||
inline __device__ int recvOffset(int i) { return (r.recvStep[i]%NCCL_STEPS)*stepLines; }
|
||||
inline __device__ int sendOffset(int i) { return (s.sendStep[i]%NCCL_STEPS)*stepLines; }
|
||||
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return r.recvBuff[i]+recvOffset(i); }
|
||||
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return s.sendBuff[i]+sendOffset(i); }
|
||||
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(r.recvStep[i]+1); }
|
||||
@@ -92,7 +93,7 @@ class ncclLLPrimitives {
|
||||
if (checkAbort(wid, 1)) break;
|
||||
}
|
||||
if (s.sendConnFifoPtr) {
|
||||
int size = ((s.sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
|
||||
int size = ((s.sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
|
||||
STORE(s.sendConnFifoPtr+s.sendConnHead%NCCL_STEPS, size);
|
||||
}
|
||||
s.sendConnHead += 1;
|
||||
@@ -112,7 +113,7 @@ class ncclLLPrimitives {
|
||||
// LL Cleanup : write all flags in the slice to make sure we don't have
|
||||
// data corruption when flag loops over.
|
||||
if ((s.sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
|
||||
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
|
||||
for (int o = offset; o<stepLines; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
|
||||
}
|
||||
s.sendStep[i]++;
|
||||
}
|
||||
@@ -212,7 +213,7 @@ class ncclLLPrimitives {
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
|
||||
r.recvBuff[i] = LOAD(&conn->llBuff);
|
||||
r.recvBuff[i] = (union ncclLLFifoLine*)LOAD(conn->buffs+NCCL_PROTO_LL);
|
||||
r.recvStep[i] = LOAD(&conn->step);
|
||||
if (wid == i) r.recvConn = conn;
|
||||
nrecv++;
|
||||
@@ -227,7 +228,7 @@ class ncclLLPrimitives {
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
|
||||
s.sendBuff[i] = LOAD(&conn->llBuff);
|
||||
s.sendBuff[i] = (union ncclLLFifoLine*)LOAD(conn->buffs+NCCL_PROTO_LL);
|
||||
s.sendStep[i] = LOAD(&conn->step);
|
||||
if (wid == i) s.sendConn = conn;
|
||||
nsend++;
|
||||
@@ -270,20 +271,20 @@ class ncclLLPrimitives {
|
||||
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
|
||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepLines, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepLines(stepLines), opCount(opCount) {
|
||||
init(recvPeers, sendPeers, channel);
|
||||
}
|
||||
|
||||
__device__ __forceinline__
|
||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount, ncclLLPrimitivesRecvData<T, NRECV>& r)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount), r(r) {
|
||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepLines, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount, ncclLLPrimitivesRecvData<T, NRECV>& r)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepLines(stepLines), opCount(opCount), r(r) {
|
||||
init(recvPeers, sendPeers, channel);
|
||||
}
|
||||
|
||||
__device__ __forceinline__
|
||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount, ncclLLPrimitivesSendData<T, NSEND>& s)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount), s(s) {
|
||||
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepLines, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount, ncclLLPrimitivesSendData<T, NSEND>& s)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepLines(stepLines), opCount(opCount), s(s) {
|
||||
init(recvPeers, sendPeers, channel);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -15,6 +15,7 @@ class ncclLL128Primitives {
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
const int wid;
|
||||
const int stepSize;
|
||||
const int warp;
|
||||
const bool flagThread;
|
||||
int nrecv = 0;
|
||||
@@ -40,8 +41,8 @@ class ncclLL128Primitives {
|
||||
volatile uint64_t* shmem;
|
||||
uint32_t* sync;
|
||||
|
||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
|
||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
|
||||
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
||||
inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
|
||||
inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
|
||||
inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
|
||||
@@ -52,9 +53,9 @@ class ncclLL128Primitives {
|
||||
__syncthreads();
|
||||
#else
|
||||
if (NSEND>NRECV) {
|
||||
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
} else {
|
||||
asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
|
||||
asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -321,7 +322,7 @@ class ncclLL128Primitives {
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
|
||||
recvBuff[i] = LOAD(&conn->ll128Buff);
|
||||
recvBuff[i] = (uint64_t*)LOAD(conn->buffs+NCCL_PROTO_LL128);
|
||||
recvStep[i] = LOAD(&conn->step);
|
||||
if (wid == i) recvConn = conn;
|
||||
nrecv++;
|
||||
@@ -336,7 +337,7 @@ class ncclLL128Primitives {
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
|
||||
sendBuff[i] = LOAD(&conn->ll128Buff);
|
||||
sendBuff[i] = (uint64_t*)LOAD(conn->buffs+NCCL_PROTO_LL128);
|
||||
sendStep[i] = LOAD(&conn->step);
|
||||
if (wid == i) sendConn = conn;
|
||||
nsend++;
|
||||
@@ -375,8 +376,8 @@ class ncclLL128Primitives {
|
||||
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
|
||||
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
|
||||
: comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
|
||||
// for __any_sync
|
||||
if (NSEND > NRECV)
|
||||
sync = channel->sync + 2 + tid/WARP_SIZE;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -13,29 +13,30 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
const int root = args->coll.root;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
@@ -61,30 +62,30 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int rank = comm->rank;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
const int root = args->coll.root;
|
||||
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->lastChunkSize;
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
@@ -112,31 +113,30 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int rank = comm->rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->coll.root;
|
||||
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -13,26 +13,27 @@ template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
@@ -75,27 +76,27 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
chunkSize = args->lastChunkSize;
|
||||
chunkSize = args->coll.lastChunkSize;
|
||||
}
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
@@ -140,29 +141,28 @@ template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
const int nthreads = args->coll.nThreads;
|
||||
const int bid = args->coll.bid;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
|
||||
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "sendrecv.h"
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Instantiate the ncclSendRecv device function for one (op, type) pair only:
// the "copy" op on int8_t, tagged i8.
// NOTE(review): IMPL_COLL_FUNC is defined outside this file (presumably in
// common.h); FuncSum looks like a placeholder functor for a pure copy — confirm.
IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
|
||||
// Kernel-side instantiation for the same (ncclSendRecv, copy, i8/int8_t) combination.
// NOTE(review): the meaning of the trailing 0 depends on IMPL_COLL_KERN's
// definition, which is not in this file — presumably a function index; confirm.
IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
|
||||
@@ -0,0 +1,70 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->p2p.nThreads;
|
||||
|
||||
// Compute pointers
|
||||
const T* sendbuff = (const T*)args->sendbuff;
|
||||
T* recvbuff = (T*)args->recvbuff;
|
||||
|
||||
if (args->p2p.delta < 0 ) return; // No-op
|
||||
|
||||
if (args->p2p.delta == 0) {
|
||||
if (tid < nthreads && sendbuff != recvbuff) {
|
||||
// local copy
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, args->p2p.sendCount);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
|
||||
const ssize_t sendSize = args->p2p.sendCount;
|
||||
const ssize_t recvSize = args->p2p.recvCount;
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize;
|
||||
int peerRecv = recvSize >= 0 ? (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks : -1;
|
||||
int peerSend = sendSize >= 0 ? (comm->rank+(int)args->p2p.delta)%comm->nRanks : -1;
|
||||
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &peerRecv, &peerSend, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
int maxSize = sendSize-chunkSize>recvSize ? sendSize-chunkSize : recvSize;
|
||||
|
||||
if (sendSize >= 0) {
|
||||
int realChunkSize = min(chunkSize, sendSize);
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
int nelem = min(realChunkSize, sendSize);
|
||||
prims.send(sendbuff, nelem);
|
||||
}
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < maxSize; gridOffset += chunkSize) {
|
||||
if (gridOffset+chunkSize < sendSize) {
|
||||
int realChunkSize = min(chunkSize, sendSize-gridOffset-chunkSize);
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + chunkSize;
|
||||
int nelem = min(realChunkSize, sendSize-offset);
|
||||
prims.send(sendbuff+offset, nelem);
|
||||
}
|
||||
if (gridOffset < recvSize) {
|
||||
int realChunkSize = min(chunkSize, recvSize-gridOffset);
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset;
|
||||
int nelem = min(realChunkSize, recvSize-offset);
|
||||
prims.recv(recvbuff+offset, nelem);
|
||||
}
|
||||
}
|
||||
if (recvSize == 0) prims.recv(recvbuff,0);
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
#include "argcheck.h" // Need some checks here since we access comm
|
||||
|
||||
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "Send",
|
||||
sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
ret = ncclEnqueueCheck(&info);
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ret;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollSendRecv, "Recv",
|
||||
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
ret = ncclEnqueueCheck(&info);
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ret;
|
||||
}
|
||||
+15
-15
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -18,7 +18,7 @@ pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
void ncclDebugInit() {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (ncclDebugLevel != -1) return;
|
||||
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
|
||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||
if (nccl_debug == NULL) {
|
||||
ncclDebugLevel = NCCL_LOG_NONE;
|
||||
@@ -61,6 +61,8 @@ void ncclDebugInit() {
|
||||
mask = NCCL_GRAPH;
|
||||
} else if (strcasecmp(subsys, "TUNING") == 0) {
|
||||
mask = NCCL_TUNING;
|
||||
} else if (strcasecmp(subsys, "ENV") == 0) {
|
||||
mask = NCCL_ENV;
|
||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
@@ -126,27 +128,32 @@ void ncclDebugInit() {
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (ncclDebugLevel == -1) ncclDebugInit();
|
||||
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
|
||||
if (ncclDebugLevel < level) return;
|
||||
|
||||
// Gather the rank information. This can take > 1us so we want to make sure
|
||||
// we only do it when needed.
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024, '.');
|
||||
int cudaDev;
|
||||
hipGetDevice(&cudaDev);
|
||||
int pid = getpid();
|
||||
int tid = gettid();
|
||||
|
||||
char buffer[1024];
|
||||
size_t len = 0;
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
|
||||
if (level == NCCL_LOG_WARN)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
|
||||
else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
|
||||
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
|
||||
else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
|
||||
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
#ifdef ENABLE_TRACE
|
||||
else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
|
||||
else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
|
||||
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
|
||||
"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line);
|
||||
}
|
||||
#endif
|
||||
if (len) {
|
||||
@@ -158,11 +165,4 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
fflush(ncclDebugFile);
|
||||
}
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
|
||||
// If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
|
||||
if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
|
||||
fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
|
||||
hostname, getpid(), gettid(), cudaDev, filefunc, line);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
+157
-62
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -58,12 +58,13 @@
|
||||
|
||||
typedef void(*ncclKern_t)(struct ncclDevComm*);
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
static ncclKern_t const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
static ncclKern_t const ncclKerns[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
NCCL_FUNCS2A(ncclAllReduce),
|
||||
NCCL_KERN_NAME(ncclSendRecv, copy, i8)
|
||||
};
|
||||
|
||||
/*****************************************************************************/
|
||||
@@ -93,11 +94,29 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
|
||||
}
|
||||
|
||||
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
|
||||
// Only launch blocks where we have work to do.
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
if (comm->channels[c].collCount) params->gridDim.x = c+1;
|
||||
}
|
||||
|
||||
// Set active = 2 for the last operation
|
||||
for (int r=0; r<params->gridDim.x; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
|
||||
for (int c=0; c<params->gridDim.x; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (channel->collCount == 0) {
|
||||
int opIndex = channel->collFifoTail;
|
||||
struct ncclColl* c = channel->collectives+opIndex;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
|
||||
while (activePtr[0] != 0) sched_yield();
|
||||
|
||||
c->args.p2p.delta = -1; // no-op
|
||||
c->funcIndex = FUNC_INDEX_P2P;
|
||||
c->args.comm = comm->devComm;
|
||||
c->active = 1;
|
||||
opIndex = (opIndex+1)%NCCL_MAX_OPS;
|
||||
c->nextIndex = opIndex;
|
||||
channel->collFifoTail = opIndex;
|
||||
channel->collCount++;
|
||||
}
|
||||
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
|
||||
}
|
||||
|
||||
@@ -150,8 +169,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
|
||||
if (comm->nRanks == 1) return ncclSuccess;
|
||||
hipLaunchParams* params = comm->myParams;
|
||||
if (params->gridDim.x == 0) return ncclSuccess;
|
||||
|
||||
NCCLCHECK(setupLaunch(comm, params));
|
||||
|
||||
@@ -170,21 +189,22 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
|
||||
params->stream = comm->userStream;
|
||||
}
|
||||
|
||||
int isLast = 0;
|
||||
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
|
||||
|
||||
if (isLast) {
|
||||
if (comm->launchMode == ncclComm::GROUP) {
|
||||
if (comm->launchMode == ncclComm::GROUP) {
|
||||
int isLast = 0;
|
||||
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
|
||||
if (isLast) {
|
||||
// I'm the last. Launch all operations.
|
||||
NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
|
||||
NCCLCHECK(ncclCpuBarrierLast(comm));
|
||||
}
|
||||
NCCLCHECK(ncclCpuBarrierLast(comm));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
if (comm->nRanks == 1) return ncclSuccess;
|
||||
hipLaunchParams *params = comm->myParams;
|
||||
if (params->gridDim.x == 0) return ncclSuccess;
|
||||
|
||||
// We can't print the CG mode before the first barrier happened.
|
||||
if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
|
||||
*comm->intraCGMode ^= 0x10;
|
||||
@@ -194,15 +214,16 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclCpuBarrierOut(comm));
|
||||
|
||||
hipLaunchParams *params = comm->myParams;
|
||||
if (comm->launchMode == ncclComm::PARALLEL) {
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
|
||||
} else {
|
||||
NCCLCHECK(ncclCpuBarrierOut(comm));
|
||||
}
|
||||
|
||||
// Start the network proxies as soon as the kernel has been launched. We can't
|
||||
// perform any CUDA call between the two or having a cudaFree between the CUDA
|
||||
// launch and the transportStartProxy call could cause a deadlock.
|
||||
// launch and the ncclProxyStart call could cause a deadlock.
|
||||
// Also, starting the proxies after the CUDA launch seems to be better for
|
||||
// performance (latency).
|
||||
for (int r=0; r<params->gridDim.x; r++) {
|
||||
@@ -212,7 +233,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
}
|
||||
params->gridDim.x = params->blockDim.x = 0;
|
||||
comm->lastOpCount = comm->opCount;
|
||||
NCCLCHECK(transportStartProxy(comm));
|
||||
NCCLCHECK(ncclProxyStart(comm));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -324,23 +345,36 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
||||
}
|
||||
|
||||
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
|
||||
coll->args.sendbuff = info->sendbuff;
|
||||
coll->args.recvbuff = info->recvbuff;
|
||||
coll->args.comm = info->comm->devComm;
|
||||
coll->args.opCount = info->comm->opCount;
|
||||
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
coll->args.p2p.sendCount = info->sendbytes;
|
||||
coll->args.p2p.recvCount = info->recvbytes;
|
||||
coll->args.p2p.delta = info->delta;
|
||||
coll->funcIndex = FUNC_INDEX_P2P;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
|
||||
#else
|
||||
coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Set nstepsPerLoop and nchunksPerLoop
|
||||
NCCLCHECK(getAlgoInfo(info));
|
||||
NCCLCHECK(getPatternInfo(info));
|
||||
NCCLCHECK(getLoopInfo(info));
|
||||
|
||||
coll->args.root = info->root;
|
||||
coll->args.N = info->count;
|
||||
coll->args.ThisInput = info->sendbuff;
|
||||
coll->args.ThisOutput = info->recvbuff;
|
||||
coll->args.comm = info->comm->devComm;
|
||||
coll->args.opCount = info->comm->opCount;
|
||||
coll->args.nChannels = info->nChannels;
|
||||
coll->args.nThreads = info->nThreads;
|
||||
coll->args.coll.root = info->root;
|
||||
coll->args.coll.count = info->count;
|
||||
coll->args.coll.nChannels = info->nChannels;
|
||||
coll->args.coll.nThreads = info->nThreads;
|
||||
|
||||
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
|
||||
|
||||
int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
|
||||
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
|
||||
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
|
||||
int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
|
||||
int chunkSize = stepSize*chunkSteps;
|
||||
@@ -354,25 +388,28 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
}
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
// Optimize chunkSize / nSteps
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
||||
} else if (info->protocol == NCCL_PROTO_LL) {
|
||||
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
|
||||
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
|
||||
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
|
||||
coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t));
|
||||
coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
|
||||
coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
||||
ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
|
||||
coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
|
||||
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
|
||||
int nstepsInter = 1+log2i(info->comm->nNodes);
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2;
|
||||
int nNodes = info->comm->nNodes;
|
||||
float ppn = info->comm->nRanks / (float)nNodes;
|
||||
float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
|
||||
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
|
||||
// Use lastChunkSize as chunkSize
|
||||
coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
||||
coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
||||
}
|
||||
|
||||
// Compute nSteps for proxies
|
||||
@@ -394,8 +431,19 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t saveKernel(struct ncclInfo* info) {
|
||||
if (info->comm->nRanks == 1) {
|
||||
// Record the stream of the first operation seen on this communicator (while
// userStreamSet is false) and reject any later operation that arrives on a
// different stream.
//   info : operation descriptor; reads info->stream, updates info->comm.
// Returns ncclInvalidUsage on a stream mismatch, ncclSuccess otherwise.
static ncclResult_t checkSetStream(struct ncclInfo* info) {
  struct ncclComm* comm = info->comm;

  if (!comm->userStreamSet) {
    // No reference stream yet: adopt this operation's stream.
    comm->userStream = info->stream;
    comm->userStreamSet = true;
    return ncclSuccess;
  }

  if (info->stream == comm->userStream) return ncclSuccess;

  WARN("Error : mixing different streams within a group call is not supported.");
  return ncclInvalidUsage;
}
|
||||
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
|
||||
if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
|
||||
if (info->sendbuff != info->recvbuff)
|
||||
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
|
||||
return ncclSuccess;
|
||||
@@ -406,22 +454,18 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
|
||||
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
|
||||
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
|
||||
|
||||
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
|
||||
if (info->comm->userStreamSet == false) {
|
||||
info->comm->userStream = info->stream;
|
||||
info->comm->userStreamSet = true;
|
||||
} else if (info->stream != info->comm->userStream) {
|
||||
WARN("Error : mixing different streams within a group call is not supported.");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
|
||||
|
||||
int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
|
||||
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
|
||||
for (int bid=0; bid<coll.args.nChannels*nSubChannels; bid++) {
|
||||
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
|
||||
|
||||
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
|
||||
int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
|
||||
info->comm->myParams->gridDim.x % info->comm->nChannels;
|
||||
struct ncclChannel* channel = info->comm->channels+channelId;
|
||||
|
||||
if (channel->collCount == NCCL_MAX_OPS) {
|
||||
WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
|
||||
WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
@@ -431,18 +475,22 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
|
||||
if (nSubChannels == 2) {
|
||||
info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
|
||||
}
|
||||
NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
|
||||
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
|
||||
NCCLCHECK(ncclProxySaveP2p(info, channel));
|
||||
} else {
|
||||
NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
|
||||
}
|
||||
info->comm->myParams->gridDim.x++;
|
||||
|
||||
int opIndex = channel->collFifoTail;
|
||||
struct ncclColl* c = channel->collectives+opIndex;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
|
||||
while (LOAD(activePtr) != 0) sched_yield();
|
||||
|
||||
memcpy(c, &coll, sizeof(struct ncclColl));
|
||||
if (info->coll != ncclCollSendRecv) c->args.coll.bid = bid % coll.args.coll.nChannels;
|
||||
|
||||
c->args.bid = bid % coll.args.nChannels;
|
||||
STORE(&c->active, 1);
|
||||
opIndex = (opIndex+1)%NCCL_MAX_OPS;
|
||||
c->nextIndex = opIndex;
|
||||
@@ -453,35 +501,82 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Save p2p operations in comm->p2plist. Operations will be posted to channels
|
||||
// during ncclGroupEnd()
|
||||
ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
struct ncclComm* comm = info->comm;
|
||||
struct ncclP2Plist* p2plist = &comm->p2plist;
|
||||
int peer = info->root;
|
||||
p2plist->count++;
|
||||
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
|
||||
if (info->recvbuff == NULL) {
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send.connected == 0) {
|
||||
p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
|
||||
}
|
||||
}
|
||||
}
|
||||
p2plist->peerlist[info->root].sendbytes = nBytes;
|
||||
p2plist->peerlist[info->root].sendbuff = info->sendbuff;
|
||||
} else {
|
||||
if (peer != comm->rank) {
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv.connected == 0) {
|
||||
p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
|
||||
}
|
||||
}
|
||||
}
|
||||
p2plist->peerlist[info->root].recvbytes = nBytes;
|
||||
p2plist->peerlist[info->root].recvbuff = info->recvbuff;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
if (info->comm == NULL) return ncclInvalidArgument;
|
||||
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
|
||||
// Launch asynchronously if needed
|
||||
if (ncclAsyncMode()) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int savedDev = -1;
|
||||
// Check arguments
|
||||
NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
|
||||
if (info->comm->checkPointers) {
|
||||
CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
|
||||
CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
|
||||
}
|
||||
// Check arguments
|
||||
NCCLCHECKGOTO(ArgsCheck(info), ret, end);
|
||||
// Always register comm even in case of error to make sure ncclGroupEnd
|
||||
// cleans it up.
|
||||
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
|
||||
NCCLCHECKGOTO(saveKernel(info), ret, end);
|
||||
NCCLCHECKGOTO(checkSetStream(info), ret, end);
|
||||
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
|
||||
if (info->coll == ncclCollSendRecv) { //p2p stored separately
|
||||
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
|
||||
}
|
||||
end:
|
||||
if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
|
||||
ncclAsyncErrCheck(ret);
|
||||
return ret;
|
||||
} else {
|
||||
NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
|
||||
NCCLCHECK(ArgsCheck(info));
|
||||
NCCLCHECK(saveKernel(info));
|
||||
NCCLCHECK(checkSetStream(info));
|
||||
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
|
||||
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
|
||||
|
||||
NCCLCHECK(ncclSaveKernel(info));
|
||||
NCCLCHECK(ncclBarrierEnqueue(info->comm));
|
||||
NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
|
||||
NCCLCHECK(ncclEnqueueEvents(info->comm));
|
||||
|
||||
+91
-11
@@ -10,6 +10,7 @@
|
||||
#include "topo.h"
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
#include "channel.h"
|
||||
|
||||
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
|
||||
|
||||
@@ -232,15 +233,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
}
|
||||
}
|
||||
}
|
||||
if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]);
|
||||
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
|
||||
*level = l >= 0 ? l : -2;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int ncclTopoUserP2pLevel = -1;
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) {
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
|
||||
*p2p = 0;
|
||||
*read = 0;
|
||||
|
||||
// Get GPUs from topology
|
||||
int g1, g2;
|
||||
@@ -255,21 +257,33 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
// In general, use P2P whenever we can.
|
||||
int p2pLevel = PATH_SYS;
|
||||
|
||||
// User override
|
||||
if (ncclTopoUserP2pLevel == -1)
|
||||
NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
|
||||
if (ncclTopoUserP2pLevel != -2) {
|
||||
p2pLevel = ncclTopoUserP2pLevel;
|
||||
goto compare;
|
||||
}
|
||||
|
||||
// Don't use P2P through ARM CPUs
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 &&
|
||||
vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
|
||||
model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
|
||||
|
||||
// User override
|
||||
NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
|
||||
if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel;
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
|
||||
else p2pLevel = PATH_SYS;
|
||||
}
|
||||
|
||||
compare:
|
||||
// Compute the PCI distance and compare with the p2pLevel.
|
||||
if (path->type <= p2pLevel) *p2p = 1;
|
||||
|
||||
if (path->type == PATH_NVL) {
|
||||
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
|
||||
// Enable P2P Read for Ampere/NVLink only
|
||||
if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -346,8 +360,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
|
||||
// Update path when we don't want to / can't use GPU Direct P2P
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p));
|
||||
int p2p, read;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
|
||||
if (p2p == 0) {
|
||||
// Divert all traffic through the CPU
|
||||
int cpu;
|
||||
@@ -442,3 +456,69 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
|
||||
free(system);
|
||||
}
|
||||
|
||||
// Compute how many p2p channels the local GPU at index g should use to reach
// peerRank. The result is written to *nChannels:
//   -1 : peerRank is the GPU at index g itself
//    1 : peerRank is not found among the GPUs of this topology (network)
//    2 : peerRank is a local GPU not reached through a PATH_NVL path
//    2*max(1, path width / per-link width) for a PATH_NVL path
// Always returns ncclSuccess.
static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
  int peer;
  if (ncclTopoRankToIndex(system, peerRank, &peer) != ncclSuccess) {
    // Remote rank, use network
    *nChannels = 1;
    return ncclSuccess;
  }

  if (peer == g) {
    // Same rank
    *nChannels = -1;
    return ncclSuccess;
  }

  // Local rank
  const struct ncclTopoLinkList* path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
  if (path->type != PATH_NVL) {
    *nChannels = 2;
    return ncclSuccess;
  }

  // The per-link width is chosen from the compute capability of GPU g.
  const int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
  const double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
  const int nLinks = (int)(path->width / nvlWidth);
  *nChannels = 2*std::max(1, nLinks);
  return ncclSuccess;
}
|
||||
|
||||
NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1);
|
||||
NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
|
||||
|
||||
// Return the smallest power of two that is >= v.
//   v <= 1 yields 1.
//   Values above the largest power of two representable in a positive int
//   (2^30 for a 32-bit int) saturate to that power instead of overflowing the
//   signed shift, which previously was undefined behaviour / an endless loop.
static int nextPow2(int v) {
  if (v <= 1) return 1;

  const unsigned int target = (unsigned int)v;
  // Highest power of two that still fits in a positive int.
  const unsigned int maxPow2 = (~0u >> 2) + 1u;

  // Shift in unsigned arithmetic so the loop can never overflow.
  unsigned int pow2 = 1;
  while (pow2 < target && pow2 < maxPow2) pow2 <<= 1;
  return (int)pow2;
}
|
||||
|
||||
// Decide how many channels point-to-point operations use and fill
// comm->p2pChannels with the order in which a peer's channels are picked.
//
// Writes comm->p2pnChannels, comm->p2pnChannelsPerPeer and comm->p2pChannels,
// and calls initChannel() for every channel index in
// [comm->nChannels, comm->p2pnChannels).
// Returns ncclSuccess, or the first error reported by a helper.
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  // Start from the collective channel count, capped by MAX_P2P_NCHANNELS and
  // then raised to MIN_P2P_NCHANNELS (the minimum is applied last).
  comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
  comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());

  // We need to loop through all local GPUs to have a global picture
  int minChannels = comm->p2pnChannels;
  const int nLocalGpus = comm->topo->nodes[GPU].count;
  for (int gpu = 0; gpu < nLocalGpus; gpu++) {
    for (int rank = 0; rank < comm->nRanks; rank++) {
      int nChannels;
      NCCLCHECK(ncclTopoGetNchannels(comm->topo, gpu, rank, &nChannels));
      // Negative means "same GPU": it does not constrain the minimum.
      if (nChannels < 0) continue;
      if (nChannels < minChannels) minChannels = nChannels;
    }
  }

  // Round to next pow2 nChannelsPerPeer and nChannels
  comm->p2pnChannelsPerPeer = nextPow2(minChannels);
  comm->p2pnChannels = nextPow2(comm->p2pnChannels);

  // Init channels that weren't used so far
  for (int ch = comm->nChannels; ch < comm->p2pnChannels; ch++) {
    NCCLCHECK(initChannel(comm, ch));
  }

  // We want to spread channels used when there aren't many and progressively
  // fill the whole space of nChannels. To do so we mirror the bits in the
  // nChannels space.
  for (int c = 0; c < comm->p2pnChannelsPerPeer; c++) {
    int mirror = 0;
    int lowBit = 1;
    int highBit = comm->p2pnChannels >> 1;
    while (lowBit < comm->p2pnChannels) {
      if (c & lowBit) mirror |= highBit;
      lowBit <<= 1;
      highBit >>= 1;
    }
    comm->p2pChannels[c] = mirror;
  }

  INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
  return ncclSuccess;
}
|
||||
|
||||
+40
-18
@@ -14,17 +14,11 @@
|
||||
// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
|
||||
// max speed.
|
||||
static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
float nvLinkWidth = VEGA_XGMI_WIDTH;
|
||||
#else
|
||||
float nvLinkWidth = gpu->gpu.cudaCompCap > 60 ? VOLTA_NVLINK_WIDTH : PASCAL_NVLINK_WIDTH;
|
||||
#endif
|
||||
float maxWidth = 0.0;
|
||||
for (int i=0; i<system->nodes[type].count; i++) {
|
||||
struct ncclTopoLinkList* path = gpu->paths[type]+i;
|
||||
float width = path->width;
|
||||
if (path->count == 0) continue;
|
||||
if (path->type == PATH_NVL) width = std::min(nvLinkWidth, width);
|
||||
maxWidth = std::max(maxWidth, width);
|
||||
}
|
||||
return maxWidth;
|
||||
@@ -78,7 +72,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
|
||||
struct ncclTopoLink* revLink = NULL;
|
||||
float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed;
|
||||
float revSpeed = 0;
|
||||
if (link->remNode->type == GPU && start->type != GPU) {
|
||||
if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
|
||||
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
|
||||
revSpeed += fwSpeed/8;
|
||||
}
|
||||
@@ -364,6 +358,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
|
||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
|
||||
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
|
||||
if (net) {
|
||||
@@ -432,13 +427,15 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
if (graph->nChannels == 0 || graph->sameChannels == 0) {
|
||||
if (graph->nChannels == 0) {
|
||||
// Always try the PCI order first to set a reference
|
||||
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
|
||||
struct ncclTopoLinkList* paths = net->paths[GPU];
|
||||
// find the first GPU that is closest to NIC
|
||||
int f = 0;
|
||||
for (int i = 0; i<system->nodes[GPU].count; i++)
|
||||
if (paths[i].count < paths[f].count) f = i;
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, NET, n, f));
|
||||
int t = 1 << 10;
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, f));
|
||||
if (t == -1) *time = -1;
|
||||
}
|
||||
|
||||
// Then try the most local GPUs
|
||||
@@ -571,7 +568,7 @@ ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, st
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) {
|
||||
int id;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
|
||||
if (graph->id != id) return ncclSuccess;
|
||||
@@ -594,11 +591,12 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
|
||||
for (int s=0; s<xmlGraph->nSubs; s++) {
|
||||
NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
|
||||
}
|
||||
*nChannels = xmlGraph->nSubs;
|
||||
return ncclSuccess;
|
||||
}
|
||||
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) {
|
||||
for (int s=0; s<xmlGraphs->nSubs; s++) {
|
||||
NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph));
|
||||
NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -771,7 +769,11 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#else
|
||||
float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#endif
|
||||
#define NSPEEDS (sizeof(speedArray)/sizeof(float))
|
||||
|
||||
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
@@ -786,10 +788,13 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
|
||||
char* str = getenv("NCCL_GRAPH_FILE");
|
||||
if (str) {
|
||||
INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
|
||||
struct ncclXml* xml;
|
||||
NCCLCHECK(ncclCalloc(&xml, 1));
|
||||
NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
|
||||
NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph));
|
||||
int nChannels;
|
||||
NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
|
||||
INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
|
||||
free(xml);
|
||||
if (graph->nChannels > 0) return ncclSuccess;
|
||||
}
|
||||
@@ -937,6 +942,15 @@ done:
|
||||
graph->typeIntra = graph->typeInter = PATH_SYS;
|
||||
graph->nChannels = 1;
|
||||
}
|
||||
|
||||
if (graph->speedIntra >= 25.0) {
|
||||
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
|
||||
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
|
||||
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
|
||||
graph->speedIntra /= 2;
|
||||
graph->speedInter /= 2;
|
||||
graph->nChannels = dupChannels;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -968,6 +982,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
|
||||
char* str = getenv("NCCL_GRAPH_DUMP_FILE");
|
||||
if (str) {
|
||||
INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
|
||||
struct ncclXml* xml;
|
||||
NCCLCHECK(ncclCalloc(&xml, 1));
|
||||
NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
|
||||
@@ -977,10 +992,17 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* dev) {
|
||||
int channel = channelId%graph->nChannels;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
|
||||
*dev = graph->inter[channel*2+index];
|
||||
// Select the network device that rank uses on channelId and store it in *dev.
//   graph != NULL : honor the net device recorded in the graph for that
//                   channel: entry 0 of the channel's inter[] pair when rank
//                   is the first GPU listed for the channel, entry 1 otherwise.
//   graph == NULL : ask ncclTopoGetLocalNet() for a device, passing channelId
//                   as the round-robin selector.
// Returns ncclSuccess, or the error reported by ncclTopoGetLocalNet().
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) {
  if (graph == NULL) {
    int64_t id;
    NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId));
    *dev = id;
    return ncclSuccess;
  }

  // Honor the net device in the graph
  const int channel = channelId%graph->nChannels;
  const int ngpus = system->nodes[GPU].count;
  const int side = (graph->intra[channel*ngpus] == rank) ? 0 : 1;
  *dev = graph->inter[channel*2+side];
  return ncclSuccess;
}
|
||||
|
||||
@@ -571,6 +571,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(ncclCalloc(&xml, 1));
|
||||
char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
|
||||
if (xmlTopoFile) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
|
||||
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
|
||||
}
|
||||
if (xml->maxIndex == 0) {
|
||||
@@ -629,6 +630,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
|
||||
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
|
||||
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
|
||||
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
|
||||
}
|
||||
|
||||
@@ -637,6 +639,28 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Pick a network device for the GPU that hosts rank.
//
// Every NET node is compared through its path to that GPU: the widest path
// wins, ties are broken by the smallest path type, and all nodes sharing the
// best (width, type) pair are kept as candidates. rr selects one of them
// round-robin (callers pass a channel id).
//
//   system : topology to search
//   rank   : rank whose GPU is the path endpoint
//   id     : out, id of the selected NET node
//   rr     : round-robin selector, expected to be non-negative
//
// Returns ncclSuccess on success, the error of ncclTopoRankToIndex() /
// ncclCalloc() when they fail, and ncclInternalError when no NET node
// qualifies (previously this case computed rr % 0).
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) {
  int g;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
  int minType = PATH_SYS;
  float maxWidth = 0;
  int count = 0;
  int* nets;
  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
    if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) {
      // Strictly better candidate: restart the list from scratch.
      maxWidth = path->width;
      minType = path->type;
      count = 0;
    }
    if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
  }
  if (count == 0) {
    // No NET node at all, or none matching the selection criteria: release the
    // scratch array and report the failure instead of dividing by zero.
    free(nets);
    return ncclInternalError;
  }
  *id = nets[rr % count];
  free(nets);
  return ncclSuccess;
}
|
||||
|
||||
/****************************/
|
||||
/* External query functions */
|
||||
/****************************/
|
||||
|
||||
+14
-1
@@ -128,8 +128,10 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
|
||||
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
|
||||
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr);
|
||||
|
||||
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
|
||||
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
||||
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
|
||||
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
|
||||
|
||||
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
|
||||
@@ -143,4 +145,15 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
/* Find the position, inside system->nodes[GPU], of the GPU node whose
 * gpu.rank equals `rank`.
 *
 * index : out - position of the first matching GPU node, or -1 if none.
 *
 * Returns ncclSuccess on a match and ncclInternalError otherwise.
 */
static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) {
  *index = -1;
  int g = 0;
  // Advance until the first GPU node carrying the requested rank.
  while (g < system->nodes[GPU].count && system->nodes[GPU].nodes[g].gpu.rank != rank) {
    g++;
  }
  if (g >= system->nodes[GPU].count) return ncclInternalError;
  *index = g;
  return ncclSuccess;
}
|
||||
|
||||
#endif
|
||||
|
||||
+37
-21
@@ -52,10 +52,6 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
|
||||
static const char* ncclAlgoStr[] = { "Tree", "Ring", "CollNet" };
|
||||
static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
|
||||
|
||||
// Latencies in us, Bandwidths in GB/s
|
||||
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
|
||||
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 37.9, 37.9, 40.4 }, { 20.5, 20.5, 27.9 }, { 37.9, 37.9, 40.4 } };
|
||||
@@ -74,10 +70,11 @@ static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 } }
|
||||
};
|
||||
|
||||
// LL128 max BW for the different collectives
|
||||
static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
|
||||
// LL128 max BW (per channel) for the different collectives
|
||||
// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
|
||||
static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
|
||||
|
||||
ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
|
||||
@@ -90,6 +87,8 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
|
||||
if (comm->nRanks <= 1) return ncclSuccess;
|
||||
|
||||
int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
|
||||
float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
|
||||
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
|
||||
@@ -99,6 +98,9 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
|
||||
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
|
||||
comm->nRanks;
|
||||
int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
|
||||
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
|
||||
comm->nNodes;
|
||||
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
|
||||
@@ -106,13 +108,17 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
|
||||
float busBw = graphs[a]->nChannels * speed;
|
||||
if (compCap80) busBw *= 0.92;
|
||||
|
||||
// Various model refinements
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0;
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
double maxTreeBw = comm->nNodes > 2 ?
|
||||
compCap80 && p == NCCL_PROTO_LL128 ? 105.0 : 80.0 :
|
||||
compCap80 && p == NCCL_PROTO_LL128 ? 130.0 : 110.0;
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3;
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
|
||||
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
|
||||
@@ -122,6 +128,9 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
comm->bandwidths[coll][a][p] = busBw * ratio;
|
||||
|
||||
comm->latencies[coll][a][p] = baseLat[a][p];
|
||||
float intraLat = hwLat[intraHw[a]][a][p];
|
||||
float interLat = hwLat[NCCL_HW_NET][a][p];
|
||||
if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
|
||||
if (a == NCCL_ALGO_RING) {
|
||||
float lat = hwLat[hw[a]][a][p];
|
||||
if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
|
||||
@@ -132,16 +141,12 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
comm->latencies[coll][a][p] += nsteps*lat;
|
||||
}
|
||||
} else {
|
||||
comm->latencies[coll][a][p] += nsteps*lat;
|
||||
comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
|
||||
}
|
||||
} else if (a == NCCL_ALGO_TREE) {
|
||||
float intraLat = hwLat[intraHw[a]][a][p];
|
||||
float interLat = hwLat[NCCL_HW_NET][a][p];
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
|
||||
} else {
|
||||
float intraLat = hwLat[intraHw[a]][a][p];
|
||||
float interLat = hwLat[NCCL_HW_NET][a][p];
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
|
||||
}
|
||||
@@ -155,17 +160,26 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 };
|
||||
|
||||
const char *protoStr = getenv("NCCL_PROTO");
|
||||
if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
|
||||
if (protoStr) {
|
||||
INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
|
||||
NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
|
||||
}
|
||||
const char *algoStr = getenv("NCCL_ALGO");
|
||||
if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||
if (algoStr) {
|
||||
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
|
||||
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||
}
|
||||
|
||||
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
int pEnable = protoEnable[p];
|
||||
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
|
||||
// Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
|
||||
pEnable = (graphs[a]->typeInter <= LINK_PCI) && graphs[a]->typeIntra == LINK_NVL && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
|
||||
// Enable LL128 by default only on Volta/Ampere+NVLink. Other cases are not tested and may cause silent data corruption.
|
||||
pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
|
||||
((minCompCap == 70 && maxCompCap == 70) || (minCompCap == 80 && maxCompCap == 80)) ? 1 : 0;
|
||||
}
|
||||
if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
||||
// Only disable algo for Allreduce since others only have one
|
||||
if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
|
||||
}
|
||||
|
||||
if (comm->rank == 0) {
|
||||
@@ -206,6 +220,7 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
// Override defaults with user env
|
||||
char* str = getenv("NCCL_THREAD_THRESHOLDS");
|
||||
if (str) {
|
||||
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
|
||||
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}};
|
||||
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
@@ -229,7 +244,7 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
|
||||
}
|
||||
|
||||
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
|
||||
// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
|
||||
// factor is not ideal but works quite well. Powers of two, 64 B to 128MB.
|
||||
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
|
||||
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 },
|
||||
{ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 },
|
||||
@@ -244,12 +259,13 @@ static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
|
||||
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) {
|
||||
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
|
||||
float lat = info->comm->latencies[info->coll][algorithm][protocol];
|
||||
if (bw == 0) {
|
||||
*time = -1.0; return ncclSuccess;
|
||||
}
|
||||
int logSize = log2i(info->nBytes>>6);
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize];
|
||||
*time = info->comm->latencies[info->coll][algorithm][protocol] + (info->nBytes) / (1000 * bw);
|
||||
*time = lat + (info->nBytes) / (1000 * bw);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -10,6 +10,10 @@
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <ctype.h>
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#endif
|
||||
#include "core.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "xml.h"
|
||||
@@ -628,7 +632,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
|
||||
}
|
||||
#else
|
||||
// NVML NVLink detection
|
||||
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : 6;
|
||||
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
|
||||
|
||||
if (maxNvLinks > 0 && nvmlDev == NULL) {
|
||||
WARN("No NVML device handle. Skipping nvlink detection.\n");
|
||||
|
||||
+140
-25
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "group.h"
|
||||
#include "debug.h"
|
||||
#include "enqueue.h"
|
||||
#include "transport.h"
|
||||
|
||||
#define MAX_ASYNC_OPS 128
|
||||
thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
|
||||
@@ -34,6 +35,7 @@ struct ncclInitArgs {
|
||||
};
|
||||
struct ncclCollArgs {
|
||||
ncclComm_t comm;
|
||||
int connect;
|
||||
};
|
||||
|
||||
enum ncclAsyncFuncType {
|
||||
@@ -52,16 +54,24 @@ struct ncclAsyncArgs {
|
||||
|
||||
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
|
||||
|
||||
#define CHECK(a) do { \
|
||||
#define NCCLCHECKTHREAD(a) do { \
|
||||
if ((args->ret = (a)) != ncclSuccess) { \
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
|
||||
return args; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
/* Check a HIP runtime call from inside an async thread entry point.
 * Requires a local `struct ncclAsyncArgs* args` in the enclosing function.
 * On failure the result is stored in args->ret, logged, and `args` is
 * returned from the enclosing function.
 * The result is stored before logging so the message shows the code being
 * reported rather than whatever args->ret held previously. */
#define CUDACHECKTHREAD(a) do { \
  if ((a) != hipSuccess) { \
    args->ret = ncclUnhandledCudaError; \
    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
    return args; \
  } \
} while(0)
|
||||
|
||||
void* ncclAsyncThreadMain(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
|
||||
NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
|
||||
return args;
|
||||
}
|
||||
|
||||
@@ -100,20 +110,50 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
/* Enter a group section. ncclGroupMode is incremented on every call, so
 * nested start calls are allowed; the per-thread async argument table is
 * zeroed only when the outermost group is opened. Always returns
 * ncclSuccess. */
ncclResult_t ncclGroupStart() {
  const bool outermost = (ncclGroupMode == 0);
  if (outermost) {
    // Start the outermost group with a zeroed argument table.
    memset(ncclGroupArgs, 0, MAX_ASYNC_OPS*sizeof(struct ncclAsyncArgs));
  }
  ncclGroupMode += 1;
  return ncclSuccess;
}
|
||||
|
||||
/* Queue one combined send/receive operation on `channelId`.
 *
 * comm      : communicator; its userStream is placed in the operation info.
 * delta     : stored in info.delta. NOTE(review): the caller appears to use
 *             it as the rank offset to the peer - confirm.
 * recvbytes / recvbuff : size and destination of the receive half.
 * sendbytes / sendbuff : size and source of the send half.
 *
 * The operation is described as ncclInt8 / ncclSum with a count equal to the
 * larger of the two byte sizes. When delta is 0 the two sizes must be equal,
 * otherwise ncclInvalidUsage is returned. Errors from ncclSaveKernel() are
 * propagated.
 */
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
  // The count recorded for the operation is the larger of the two halves.
  const ssize_t largest = (sendbytes < recvbytes) ? recvbytes : sendbytes;

  struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
    sendbuff, recvbuff, (size_t)largest, ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
    1, 1 };
  info.sendbytes = sendbytes;
  info.recvbytes = recvbytes;
  info.channelId = channelId;
  info.delta = delta;

  const bool sizesDiffer = (sendbytes != recvbytes);
  if (delta == 0 && sizesDiffer) {
    return ncclInvalidUsage;
  }

  NCCLCHECK(ncclSaveKernel(&info));
  return ncclSuccess;
}
|
||||
|
||||
void* ncclAsyncThreadPreconnect(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
CUDACHECKTHREAD(hipSetDevice(args->coll.comm->cudaDev));
|
||||
for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
struct ncclP2PConnect* connect = &comm->p2plist.connect;
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
|
||||
connect->nrecv[c] = 0;
|
||||
connect->nsend[c] = 0;
|
||||
}
|
||||
return args;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
if (ncclGroupMode == 0) return ncclInvalidUsage;
|
||||
ncclGroupMode--;
|
||||
if (ncclGroupMode > 0) return ncclSuccess;
|
||||
int savedDev;
|
||||
CUDACHECK(hipGetDevice(&savedDev));
|
||||
int done = ncclGroupIndex;
|
||||
int activeThreads = 0;
|
||||
int doneArray[MAX_ASYNC_OPS];
|
||||
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
|
||||
ncclResult_t ret = ncclGroupError;
|
||||
if (ret != ncclSuccess) goto group_cleanup;
|
||||
|
||||
@@ -122,6 +162,97 @@ ncclResult_t ncclGroupEnd() {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
|
||||
activeThreads++;
|
||||
doneArray[i] = 0;
|
||||
}
|
||||
}
|
||||
/* For init, since we use threads, we just wait for threads to complete */
|
||||
while (activeThreads) {
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
|
||||
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
|
||||
if (err == EBUSY) continue;
|
||||
if (err != 0) ret = ncclSystemError;
|
||||
if (args->ret != ncclSuccess) ret = args->ret;
|
||||
doneArray[i] = 1;
|
||||
activeThreads--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
|
||||
if (p2plist->count != 0) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
args->coll.connect = 0;
|
||||
for (int c=0; c<comm->p2pnChannels; c++)
|
||||
args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
|
||||
if (args->coll.connect) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s\n", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
int rank = comm->rank;
|
||||
int nRanks = comm->nRanks;
|
||||
struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
|
||||
if (p2plist->count) {
|
||||
for (int delta=0; delta<nRanks; delta++) {
|
||||
uint32_t from = (rank+nRanks-delta)%nRanks;
|
||||
uint32_t to = (rank+delta)%nRanks;
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
// Split each operation on p2pnChannelsPerPeer max.
|
||||
ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
|
||||
ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
|
||||
recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
|
||||
sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
|
||||
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int remaining = 1;
|
||||
int chunk = 0;
|
||||
while (remaining) {
|
||||
int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
|
||||
remaining = 0;
|
||||
ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
|
||||
ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
|
||||
if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
|
||||
if (sendbytes >= 0 || recvbytes >= 0) {
|
||||
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
|
||||
recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
|
||||
sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
chunk++;
|
||||
}
|
||||
}
|
||||
p2plist->count = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,25 +286,9 @@ ncclResult_t ncclGroupEnd() {
|
||||
if (args->coll.comm->userStream == NULL)
|
||||
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
|
||||
NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
|
||||
doneArray[i] = 1;
|
||||
done--;
|
||||
}
|
||||
}
|
||||
|
||||
/* For init, since we use threads, we just wait for threads to complete */
|
||||
while (done) {
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
|
||||
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
|
||||
if (err == EBUSY) continue;
|
||||
if (err != 0) ret = ncclSystemError;
|
||||
if (args->ret != ncclSuccess) ret = args->ret;
|
||||
doneArray[i] = 1;
|
||||
done--;
|
||||
}
|
||||
}
|
||||
}
|
||||
goto end;
|
||||
group_cleanup:
|
||||
if (ret != ncclSuccess) {
|
||||
@@ -181,12 +296,12 @@ group_cleanup:
|
||||
// an atomic operation, we need to cancel all operations.
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
|
||||
if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm));
|
||||
if (args->funcType == ASYNC_FUNC_INIT) {
|
||||
if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm);
|
||||
*args->init.newcomm = NULL;
|
||||
} else {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
for (int i=0; i<channel->collCount; i++) {
|
||||
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -13,10 +13,10 @@
|
||||
#include "align.h"
|
||||
#include <sys/mman.h>
|
||||
|
||||
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
|
||||
CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
|
||||
memset(*ptr, 0, size);
|
||||
*devPtr = *ptr;
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) {
|
||||
CUDACHECK(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped));
|
||||
memset(*ptr, 0, nelem*sizeof(T));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -12,17 +12,17 @@
|
||||
|
||||
// Check CUDA calls
|
||||
#define CUDACHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
WARN("HIP failure '%s'", hipGetErrorString(err)); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUDACHECKGOTO(cmd, res, label) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
WARN("HIP failure '%s'", hipGetErrorString(err)); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -8,10 +8,8 @@
|
||||
#ifndef NCCL_COLLECTIVES_H_
|
||||
#define NCCL_COLLECTIVES_H_
|
||||
|
||||
#include "core.h"
|
||||
#include "info.h"
|
||||
|
||||
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
|
||||
#define FUNC_INDEX_P2P 1800
|
||||
#define FUNC_INDEX(coll, redop, dtype, al, pr) (((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)
|
||||
|
||||
#define NCCL_COLL_NAME(coll, op, dtype) \
|
||||
coll##_##op##_##dtype
|
||||
@@ -58,6 +56,7 @@
|
||||
DECL_COLL2(ncclAllGather, copy) \
|
||||
DECL_COLL(ncclReduceScatter) \
|
||||
DECL_COLL(ncclAllReduce) \
|
||||
DECL_COLL5(ncclSendRecv,copy,i8) \
|
||||
|
||||
DECL_ALL_COLLS
|
||||
|
||||
@@ -78,5 +77,6 @@ DECL_ALL_COLLS
|
||||
#define BROADCAST_CHUNKSTEPS 1
|
||||
#define REDUCE_SLICESTEPS 1
|
||||
#define REDUCE_CHUNKSTEPS 1
|
||||
#define SENDRECV_SLICEFACTOR 4
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -9,6 +9,7 @@
|
||||
#define NCCL_COMM_H_
|
||||
|
||||
#include "transport.h"
|
||||
#include "p2p.h"
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
@@ -44,6 +45,7 @@ struct ncclSendMem {
|
||||
};
|
||||
char pad3[MEM_ALIGN];
|
||||
};
|
||||
char buff[1]; // Actually larger than that
|
||||
};
|
||||
|
||||
struct ncclRecvMem {
|
||||
@@ -57,8 +59,6 @@ struct ncclRecvMem {
|
||||
};
|
||||
char pad4[MEM_ALIGN];
|
||||
};
|
||||
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
|
||||
uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
|
||||
char buff[1]; // Actually larger than that
|
||||
};
|
||||
|
||||
@@ -92,6 +92,13 @@ struct ncclComm {
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
// Channels (per peer) for p2p
|
||||
int p2pnChannels;
|
||||
int p2pnChannelsPerPeer;
|
||||
int p2pChannels[MAXCHANNELS];
|
||||
|
||||
// Buffer sizes
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// Algorithm/Protocols thresholds
|
||||
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
@@ -138,6 +145,8 @@ struct ncclComm {
|
||||
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
//list of async p2p operation queued in a group semantics
|
||||
struct ncclP2Plist p2plist;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -52,19 +52,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
}
|
||||
}
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
#define NCCL_ALGO_COLLNET 2
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
#define NCCL_PROTO_LL 0
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "alloc.h"
|
||||
|
||||
+45
-30
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -22,6 +22,22 @@
|
||||
#define STORE(DST, SRC) *(DST) = (SRC)
|
||||
#endif
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollSendRecv} ncclFunc_t;
|
||||
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
#define NCCL_ALGO_COLLNET 2
|
||||
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
#define NCCL_PROTO_LL 0
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
|
||||
|
||||
#define NCCL_MAX_OPS 2048
|
||||
#define NCCL_STEPS 8
|
||||
|
||||
@@ -45,9 +61,6 @@ union ncclLLFifoLine {
|
||||
#define NCCL_MAX_NTHREADS 256
|
||||
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
|
||||
#define NCCL_LL_LINES_PER_THREAD 8
|
||||
#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
|
||||
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
|
||||
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
|
||||
#ifdef TEST_LL_CLEANUP
|
||||
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
|
||||
#define NCCL_LL_FLAG_MAX 0x100
|
||||
@@ -68,13 +81,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
|
||||
// Receiving from up to 3 sources is more compute intensive than sending
|
||||
// to 3 dests. Use 70% for reduce and 30% for bcast.
|
||||
#define NCCL_LL128_SPLIT(nt) (nt/2)
|
||||
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
|
||||
|
||||
#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS)
|
||||
#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t))
|
||||
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 2
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
|
||||
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
|
||||
#define NCCL_DIRECT_GPU 0x01
|
||||
@@ -82,7 +91,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buff; // Local for recv, remote for send
|
||||
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
|
||||
uint64_t *tail; // Local for recv, remote for send
|
||||
uint64_t *head; // Local for send, remote for recv
|
||||
uint64_t *opCountLoc; // opCount of local rank
|
||||
@@ -94,9 +103,6 @@ struct ncclConnInfo {
|
||||
int *fifo; // Size fifo for proxy
|
||||
|
||||
uint64_t step; // Keep where we are
|
||||
|
||||
// Low latency mechanism
|
||||
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
|
||||
uint64_t llLastCleaning;
|
||||
|
||||
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
|
||||
@@ -104,9 +110,6 @@ struct ncclConnInfo {
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
|
||||
// High bandwidth, low latency protocol
|
||||
uint64_t* ll128Buff; // Local for recv, remote for send
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
@@ -155,17 +158,31 @@ struct CollectiveArgs {
|
||||
uint64_t opCount;
|
||||
|
||||
// local and remote input, output, and buffer
|
||||
const void * ThisInput;
|
||||
void * ThisOutput;
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
|
||||
// general parameters
|
||||
size_t N;
|
||||
uint32_t root;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint16_t nThreads;
|
||||
|
||||
int lastChunkSize;
|
||||
// Op-specific fields. Make sure the common part stays the
|
||||
// same on all structs of the union
|
||||
union {
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
} common;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint32_t root;
|
||||
size_t count;
|
||||
size_t lastChunkSize;
|
||||
} coll;
|
||||
struct {
|
||||
uint16_t nThreads;
|
||||
uint16_t unused;
|
||||
int32_t delta;
|
||||
size_t sendCount;
|
||||
size_t recvCount;
|
||||
} p2p;
|
||||
};
|
||||
};
|
||||
struct ncclColl {
|
||||
union {
|
||||
@@ -190,8 +207,6 @@ struct ncclChannel {
|
||||
struct ncclTree collTreeDn;
|
||||
|
||||
int id;
|
||||
int nthreads;
|
||||
int buffSize;
|
||||
|
||||
// Communication structures
|
||||
struct ncclPeer* peers;
|
||||
@@ -199,7 +214,6 @@ struct ncclChannel {
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclColl* collectives;
|
||||
struct ncclColl* devCollectives;
|
||||
int collStart;
|
||||
int collCount;
|
||||
int collFifoHead; // Only used by GPU
|
||||
@@ -282,6 +296,7 @@ typedef enum {
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -12,11 +12,12 @@
|
||||
#include "collectives.h"
|
||||
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
|
||||
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
|
||||
ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
|
||||
ncclResult_t ncclBarrierEnqueue(ncclComm_t comm);
|
||||
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm);
|
||||
ncclResult_t ncclEnqueueEvents(ncclComm_t comm);
|
||||
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
|
||||
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
|
||||
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
|
||||
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
|
||||
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
@@ -25,10 +25,11 @@ ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
|
||||
void ncclTopoFree(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* net);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p);
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
|
||||
// Set CPU affinity
|
||||
@@ -96,7 +97,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
|
||||
|
||||
ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
|
||||
#include "info.h"
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time);
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -9,7 +9,7 @@
|
||||
#define NCCL_INFO_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
|
||||
typedef enum {
|
||||
ncclPatternRing,
|
||||
@@ -48,6 +48,10 @@ struct ncclInfo {
|
||||
size_t nBytes;
|
||||
int nstepsPerLoop;
|
||||
int nchunksPerLoop;
|
||||
ssize_t sendbytes;
|
||||
ssize_t recvbytes;
|
||||
uint32_t delta;
|
||||
int channelId;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#define NCCL_PTR_CUDA 0x2
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifndef NCCL_P2P_H_
|
||||
#define NCCL_P2P_H_
|
||||
|
||||
struct ncclP2Pinfo {
|
||||
const void* sendbuff;
|
||||
void* recvbuff;
|
||||
ssize_t sendbytes;
|
||||
ssize_t recvbytes;
|
||||
};
|
||||
|
||||
struct ncclP2PConnect {
|
||||
int nrecv[MAXCHANNELS];
|
||||
int nsend[MAXCHANNELS];
|
||||
int* recv;
|
||||
int* send;
|
||||
};
|
||||
|
||||
struct ncclP2Plist {
|
||||
struct ncclP2Pinfo *peerlist;
|
||||
int count;
|
||||
struct ncclP2PConnect connect;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,77 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROXY_H_
|
||||
#define NCCL_PROXY_H_
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||
|
||||
struct ncclProxyArgs;
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
struct ncclProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
struct ncclChannel* channel;
|
||||
struct ncclConnector* connector;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int protocol;
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
|
||||
// Internal state
|
||||
uint64_t head;
|
||||
uint64_t tail;
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
int idle;
|
||||
|
||||
// Element linking
|
||||
pthread_mutex_t mutex;
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
};
|
||||
|
||||
struct ncclProxyPool;
|
||||
struct ncclProxyState {
|
||||
pthread_cond_t cond;
|
||||
pthread_mutex_t mutex;
|
||||
bool stop;
|
||||
struct ncclProxyArgs* ops;
|
||||
struct ncclProxyArgs* pool;
|
||||
struct ncclProxyPool* pools;
|
||||
};
|
||||
|
||||
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
enum proxyMode {
|
||||
proxyRing = 0,
|
||||
proxyFrom = 1,
|
||||
proxyTo = 2
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
// Spin wait until func evaluates to true
|
||||
template<typename FUNC>
|
||||
inline void transportProxyWait(const FUNC& func) {
|
||||
while (!func()) {
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -53,6 +53,8 @@ static inline int envSocketFamily(void) {
|
||||
if (env == NULL)
|
||||
return family;
|
||||
|
||||
INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
|
||||
|
||||
if (strcmp(env, "AF_INET") == 0)
|
||||
family = AF_INET; // IPv4
|
||||
else if (strcmp(env, "AF_INET6") == 0)
|
||||
@@ -290,6 +292,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
|
||||
// User specified interface
|
||||
char* env = getenv("NCCL_SOCKET_IFNAME");
|
||||
if (env && strlen(env) > 1) {
|
||||
INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
|
||||
// Specified by user : find or fail
|
||||
if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
|
||||
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
@@ -301,7 +304,8 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
|
||||
if (nIfs == 0) {
|
||||
char* commId = getenv("NCCL_COMM_ID");
|
||||
if (commId && strlen(commId) > 1) {
|
||||
// Try to find interface that is in the same subnet as the IP in comm id
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
|
||||
// Try to find interface that is in the same subnet as the IP in comm id
|
||||
union socketAddress idAddr;
|
||||
GetSocketAddrFromString(&idAddr, commId);
|
||||
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "graph.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "core.h"
|
||||
#include "proxy.h"
|
||||
|
||||
#define NTRANSPORTS 3
|
||||
#define TRANSPORT_P2P 0
|
||||
@@ -39,49 +40,8 @@ struct ncclConnect {
|
||||
char data[CONNECT_SIZE];
|
||||
};
|
||||
|
||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||
|
||||
struct ncclProxyArgs;
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
struct ncclProxyArgs {
|
||||
proxyProgressFunc_t progress;
|
||||
struct ncclChannel* channel;
|
||||
struct ncclConnector* connector;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int protocol;
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
|
||||
// Internal state
|
||||
uint64_t head;
|
||||
uint64_t tail;
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
int idle;
|
||||
|
||||
// Element linking
|
||||
pthread_mutex_t mutex;
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
};
|
||||
|
||||
struct ncclProxyPool;
|
||||
struct ncclProxyState {
|
||||
pthread_cond_t cond;
|
||||
pthread_mutex_t mutex;
|
||||
bool stop;
|
||||
struct ncclProxyArgs* ops;
|
||||
struct ncclProxyArgs* pool;
|
||||
struct ncclProxyPool* pools;
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
|
||||
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
|
||||
ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
|
||||
ncclResult_t (*free)(void*);
|
||||
ncclResult_t (*proxy)(struct ncclProxyArgs*);
|
||||
@@ -94,30 +54,6 @@ struct ncclTransport {
|
||||
struct ncclTransportComm recv;
|
||||
};
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
enum proxyMode {
|
||||
proxyRing = 0,
|
||||
proxyFrom = 1,
|
||||
proxyTo = 2
|
||||
};
|
||||
|
||||
ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
|
||||
ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
|
||||
ncclResult_t transportStartProxy(struct ncclComm* comm);
|
||||
ncclResult_t transportCreateProxy(struct ncclComm* comm);
|
||||
ncclResult_t transportDestroyProxy(struct ncclComm* comm);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
// Spin wait until func evaluates to true
|
||||
template<typename FUNC>
|
||||
inline void transportProxyWait(const FUNC& func) {
|
||||
while (!func()) {
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
|
||||
|
||||
#endif
|
||||
|
||||
+69
-83
@@ -41,6 +41,10 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
|
||||
#endif
|
||||
|
||||
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
|
||||
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
|
||||
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
|
||||
|
||||
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
||||
|
||||
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||
@@ -120,7 +124,7 @@ static ncclResult_t ncclInit() {
|
||||
pthread_mutex_lock(&initLock);
|
||||
if (!initialized) {
|
||||
initEnv();
|
||||
initNet();
|
||||
NCCLCHECK(initNet());
|
||||
INFO(NCCL_INIT, "Using network %s", ncclNetName());
|
||||
initialized = true;
|
||||
}
|
||||
@@ -206,6 +210,9 @@ void *ncclCommThreadMain(void *arg) {
|
||||
static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
free(comm->p2plist.peerlist);
|
||||
free(comm->p2plist.connect.recv);
|
||||
free(comm->p2plist.connect.send);
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
|
||||
@@ -252,7 +259,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
CUDACHECK(hipFree(comm->hostDevComm.channels));
|
||||
CUDACHECK(hipFree(comm->devComm));
|
||||
|
||||
for (int channel=0; channel<comm->nChannels; channel++)
|
||||
for (int channel=0; channel<MAXCHANNELS; channel++)
|
||||
NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
|
||||
|
||||
if (comm->doneEvent != NULL)
|
||||
@@ -316,10 +323,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
#endif
|
||||
comm->fatalError = ncclSuccess;
|
||||
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
|
||||
NCCLCHECK(ncclCudaHostCalloc((ncclDevError_t**)&comm->fatalDevError, 1));
|
||||
comm->hostDevComm.fatalDevError = comm->fatalDevError;
|
||||
STORE(comm->fatalDevError, ncclDevSuccess);
|
||||
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
|
||||
NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
|
||||
comm->hostDevComm.abortFlag = comm->abortFlag;
|
||||
STORE(comm->abortFlag, 0);
|
||||
|
||||
comm->argsptr = &comm->args;
|
||||
@@ -338,6 +347,14 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->hostDevComm.collTraceThread = 0;
|
||||
#endif
|
||||
comm->collNetSupport = 0;
|
||||
comm->p2plist.count=0;
|
||||
NCCLCHECK(ncclCalloc(&comm->p2plist.peerlist, comm->nRanks));
|
||||
for (int r=0; r<comm->nRanks; r++) comm->p2plist.peerlist[r].sendbytes = comm->p2plist.peerlist[r].recvbytes = -1;
|
||||
NCCLCHECK(ncclCalloc(&comm->p2plist.connect.recv, MAXCHANNELS*comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->p2plist.connect.send, MAXCHANNELS*comm->nRanks));
|
||||
|
||||
// Mark channels as non initialized.
|
||||
for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
|
||||
|
||||
*comret = comm;
|
||||
return ncclSuccess;
|
||||
@@ -345,13 +362,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
|
||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
// Duplicate the channels on the device
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
|
||||
|
||||
// Copy userRanks and peers
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
for (int r=0; r<comm->p2pnChannels; r++) {
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks+1));
|
||||
}
|
||||
|
||||
// Duplicate the dev comm on the device
|
||||
@@ -396,23 +412,6 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
|
||||
if (ret) {
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("No transport found !");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
|
||||
NCCLCHECK(initChannel(comm, channelId));
|
||||
@@ -485,6 +484,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
|
||||
// Set CG Mode
|
||||
comm->launchMode = ncclComm::GROUP;
|
||||
char* str = getenv("NCCL_LAUNCH_MODE");
|
||||
if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str);
|
||||
if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
|
||||
comm->launchMode = ncclComm::PARALLEL;
|
||||
}
|
||||
@@ -505,50 +505,26 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
|
||||
uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
|
||||
struct ncclConnect connect;
|
||||
struct ncclConnector* conn;
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].recv;
|
||||
if (conn->connected) { ++nSkippedRecv; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
||||
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
|
||||
#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
|
||||
#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
|
||||
#define DEFAULT_BUFFSIZE_ARM (1LL << 20) /* 1MiB */
|
||||
NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
|
||||
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
|
||||
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
|
||||
|
||||
static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
int cpuArch, cpuVendor, cpuModel;
|
||||
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
|
||||
|
||||
int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
|
||||
int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
|
||||
|
||||
if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
comm->buffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].send;
|
||||
if (conn->connected) { ++nSkippedSend; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
||||
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].send;
|
||||
if (conn->connected) {++nSkippedSend; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
}
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks) continue;
|
||||
conn = &channel->peers[peer].recv;
|
||||
if (conn->connected) {++nSkippedRecv; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
}
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -557,7 +533,8 @@ extern struct ncclTransport collNetTransport;
|
||||
// All ranks must participate in collNetSetup call
|
||||
// type: 0 for send, 1 for recv
|
||||
// return: 0 - unsupported, 1 - supported
|
||||
static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int collNetChannels, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
|
||||
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
|
||||
static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
|
||||
int rankInCollNet = -1;
|
||||
int supported = 0;
|
||||
int isMaster = (rank == masterRank) ? 1 : 0;
|
||||
@@ -589,7 +566,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
|
||||
// setup
|
||||
struct ncclConnect myConnect;
|
||||
if (isMaster && ret > 0) {
|
||||
NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->buffSize, channel->id));
|
||||
NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
|
||||
}
|
||||
// prepare connect handles
|
||||
ncclResult_t res;
|
||||
@@ -620,12 +597,15 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
|
||||
// connect
|
||||
if (isMaster && ret > 0) {
|
||||
NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
||||
struct ncclPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
|
||||
CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
|
||||
}
|
||||
// recv side sends connect info to send side
|
||||
if (isMaster && type == 1) {
|
||||
sendrecvExchange.collNetRank = rankInCollNet;
|
||||
memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
|
||||
NCCLCHECK(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
|
||||
INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
||||
}
|
||||
if (ret > 0) {
|
||||
@@ -852,7 +832,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
|
||||
if (comm->nNodes > 1 &&
|
||||
ncclParamCollNetEnable() == 1 &&
|
||||
collNetSupport()) {
|
||||
collNetSupport() && collNetGraph.nChannels) {
|
||||
NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
|
||||
}
|
||||
|
||||
@@ -864,7 +844,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
|
||||
|
||||
NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
|
||||
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
|
||||
|
||||
char line[1024];
|
||||
line[0]='\0';
|
||||
@@ -885,6 +865,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank));
|
||||
ncclResult_t ret;
|
||||
|
||||
NCCLCHECK(computeBuffSizes(comm));
|
||||
|
||||
// Connect with prev/next for each ring
|
||||
struct ncclConnect *connect;
|
||||
NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
|
||||
@@ -892,15 +874,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
|
||||
}
|
||||
|
||||
// Check if we can setup CollNet
|
||||
if (comm->nNodes > 1 &&
|
||||
ncclParamCollNetEnable() == 1 &&
|
||||
collNetSupport()) {
|
||||
collNetSupport() && collNetGraph.nChannels) {
|
||||
int logicChannels = comm->nChannels/2;
|
||||
int collNetSetupFail = 0;
|
||||
const int recvIndex = 0; // recv GPU index is always 0
|
||||
@@ -908,13 +890,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
for (int c=0; c<logicChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
|
||||
NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
|
||||
const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
|
||||
const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
|
||||
if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
|
||||
if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
|
||||
collNetSetupFail = 1;
|
||||
if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
|
||||
else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
|
||||
collNetSetupFail = 1;
|
||||
}
|
||||
// Verify CollNet setup across ranks
|
||||
@@ -924,6 +906,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
free(connect);
|
||||
free(rings);
|
||||
|
||||
// Compute nChannels per peer for p2p
|
||||
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
|
||||
|
||||
// We should have allocated all buffers, collective fifos, ... we can
|
||||
// restore the affinity.
|
||||
affinity_restore:
|
||||
@@ -952,7 +937,7 @@ affinity_restore:
|
||||
// Done with AllGather1 data
|
||||
free(allGather1Data);
|
||||
|
||||
if (comm->nNodes) NCCLCHECK(transportCreateProxy(comm));
|
||||
if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
||||
return ncclSuccess;
|
||||
@@ -979,6 +964,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
|
||||
ncclResult_t res;
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env && myrank == 0) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
||||
NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end);
|
||||
}
|
||||
|
||||
@@ -1047,7 +1033,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError);
|
||||
|
||||
CUDACHECK(hipStreamSynchronize(comm->groupStream));
|
||||
NCCLCHECK(transportDestroyProxy(comm));
|
||||
NCCLCHECK(ncclProxyDestroy(comm));
|
||||
NCCLCHECK(commFree(comm));
|
||||
|
||||
if (savedDevice != commDevice)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -35,7 +35,6 @@ ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
|
||||
}
|
||||
|
||||
ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
|
||||
// First, the easy ones
|
||||
if (info->root < 0 || info->root >= info->comm->nRanks) {
|
||||
WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
|
||||
@@ -45,7 +44,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
WARN("%s : invalid type %d", info->opName, info->datatype);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
// Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
|
||||
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
|
||||
info->nBytes = info->count * ncclTypeSize(info->datatype);
|
||||
if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
|
||||
info->count = info->nBytes;
|
||||
@@ -59,12 +58,20 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
}
|
||||
|
||||
if (info->comm->checkPointers) {
|
||||
// Check CUDA device pointers
|
||||
if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
|
||||
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
|
||||
}
|
||||
if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
|
||||
NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
|
||||
if (info->coll == ncclCollSendRecv) {
|
||||
if (strcmp(info->opName, "Send") == 0) {
|
||||
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
|
||||
} else {
|
||||
NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv"));
|
||||
}
|
||||
} else {
|
||||
// Check CUDA device pointers
|
||||
if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
|
||||
NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
|
||||
}
|
||||
if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
|
||||
NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -95,6 +95,7 @@ uint64_t getHostHash(void) {
|
||||
int offset = strlen(hostHash);
|
||||
|
||||
if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
|
||||
INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
|
||||
strncpy(hostHash, hostId, sizeof(hostHash));
|
||||
} else {
|
||||
FILE *file = fopen(HOSTID_FILE, "r");
|
||||
|
||||
+46
-6
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -237,6 +237,40 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
|
||||
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* Send
|
||||
*
|
||||
* Send data from sendbuff to rank peer.
|
||||
*
|
||||
* Rank peer needs to call ncclRecv with the same datatype and the same count from this
|
||||
* rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
|
||||
* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
|
||||
* ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* Receive
|
||||
*
|
||||
* Receive data from rank peer into recvbuff.
|
||||
*
|
||||
* Rank peer needs to call ncclSend with the same datatype and the same count to this
|
||||
* rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
|
||||
* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
|
||||
* ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* Group semantics
|
||||
*
|
||||
@@ -252,21 +286,27 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
|
||||
* the operation is effectively done.
|
||||
*
|
||||
* Both collective communication and ncclCommInitRank can be used in conjunction
|
||||
* with ncclGroupStart/ncclGroupEnd.
|
||||
* with ncclGroupStart/ncclGroupEnd, but not together.
|
||||
*
|
||||
* Group semantics also allow fusing multiple operations on the same device
|
||||
* to improve performance (for aggregated collective calls), or to permit
|
||||
* concurrent progress of multiple send/receive operations.
|
||||
*/
|
||||
|
||||
/*! @brief Group Start
|
||||
*
|
||||
* @details Start a group call. All subsequent calls to NCCL may not block due to
|
||||
* inter-CPU synchronization.
|
||||
* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
|
||||
* a single NCCL operation. Nothing will be started on the CUDA stream until
|
||||
* ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupStart();
|
||||
ncclResult_t pncclGroupStart();
|
||||
|
||||
/*! @brief Group End
|
||||
*
|
||||
* @details End a group call. Wait for all calls since ncclGroupStart to complete
|
||||
* before returning.
|
||||
* End a group call. Start a fused NCCL operation consisting of all calls since
|
||||
* ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
|
||||
* need to be called after ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupEnd();
|
||||
ncclResult_t pncclGroupEnd();
|
||||
|
||||
@@ -0,0 +1,283 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
#include "collectives.h"
|
||||
|
||||
#define RECV 0
|
||||
#define SEND 1
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
|
||||
|
||||
/* In chains, one rank does not need a proxy. Let's figure out which one it is */
|
||||
// Which index in the reorganized rings should we compare root against */
|
||||
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
|
||||
int index = pattern == ncclPatternPipelineFrom ?
|
||||
/* no recv / no send if root = */
|
||||
/* bcast */ (type == RECV ? myrank : nextrank ):
|
||||
/* reduce */ (type == RECV ? prevrank : myrank );
|
||||
int rank = ring->userRanks[index];
|
||||
return (root != rank);
|
||||
}
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
|
||||
#define PROXYARGS_ALLOCATE_SIZE 32
|
||||
struct ncclProxyPool {
|
||||
struct ncclProxyPool *next;
|
||||
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
|
||||
};
|
||||
|
||||
// Take one ncclProxyArgs element from comm's free list (state->pool) and
// return it through *argsptr with its next and nextPeer links cleared.
// When the free list is empty, a new block of PROXYARGS_ALLOCATE_SIZE elements
// is allocated, chained, and recorded in state->pools for later release.
// Returns ncclSuccess, or the error reported by ncclCalloc when a new block
// cannot be allocated (in which case *argsptr is left untouched).
static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
  struct ncclProxyState* state = &comm->proxyState;
  struct ncclProxyArgs* elem;
  pthread_mutex_lock(&state->mutex);
  if (state->pool == NULL) {
    // Allocate a new pool of elements
    struct ncclProxyPool* newPool;
    ncclResult_t allocRes = ncclCalloc(&newPool, 1);
    if (allocRes != ncclSuccess) {
      // Drop the lock before bailing out; returning with it held would
      // block every later user of state->mutex.
      pthread_mutex_unlock(&state->mutex);
      return allocRes;
    }
    struct ncclProxyArgs* newElems = newPool->elems;
    // Chain newly allocated elements. The last element's next is not written
    // here (presumably left NULL by ncclCalloc).
    for (int i=0; i+1 < PROXYARGS_ALLOCATE_SIZE; i++) {
      newElems[i].next = newElems+i+1;
    }
    // Add them all to the pool list
    state->pool = newElems;
    // Save the pool memory block for later resource release
    newPool->next = state->pools;
    state->pools = newPool;
  }
  // Pop the head of the free list
  elem = state->pool;
  state->pool = state->pool->next;
  pthread_mutex_unlock(&state->mutex);
  elem->next = elem->nextPeer = NULL;
  *argsptr = elem;
  return ncclSuccess;
}
|
||||
|
||||
static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
|
||||
struct ncclComm* comm = connector->comm;
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
if (connector->proxyAppend == NULL) {
|
||||
// Nothing running for that peer. Add to the circular list
|
||||
if (state->ops == NULL) {
|
||||
// Create the list
|
||||
args->next = args;
|
||||
state->ops = args;
|
||||
} else {
|
||||
// Insert element in the list
|
||||
args->next = state->ops->next;
|
||||
state->ops->next = args;
|
||||
}
|
||||
connector->proxyAppend = args;
|
||||
} else {
|
||||
// There is an active operation already for that peer.
|
||||
// Add it to the per-peer list
|
||||
connector->proxyAppend->nextPeer = args;
|
||||
connector->proxyAppend = args;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
|
||||
if (peer < 0) return ncclSuccess;
|
||||
|
||||
struct ncclPeer* peerComm = args->channel->peers+peer;
|
||||
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
|
||||
if (connector->transportComm == NULL) {
|
||||
WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
|
||||
type == proxyRecv ? "recv" : "send", peer, args->channel->id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (connector->transportComm->proxy == NULL) return ncclSuccess;
|
||||
|
||||
struct ncclProxyArgs* op;
|
||||
NCCLCHECK(allocateArgs(connector->comm, &op));
|
||||
memcpy(op, args, sizeof(struct ncclProxyArgs));
|
||||
op->connector = connector;
|
||||
op->progress = connector->transportComm->proxy;
|
||||
op->state = ncclProxyOpReady;
|
||||
ProxyAppend(connector, op);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Queue the proxy operations a collective needs on args->channel, depending
// on its communication pattern.
//   args    : template proxy operation (copied for each peer by SaveProxy)
//   pattern : ncclPattern* value of the collective
//   root    : root rank, used for the pipeline patterns
//   nranks  : number of ranks, forwarded to NeedProxy
// Returns ncclSuccess or the first error reported by SaveProxy.
ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
  const bool ringLike = pattern == ncclPatternRing || pattern == ncclPatternRingTwice ||
                        pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo;
  const bool treeUpward = pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown;
  const bool treeDownward = pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown;

  if (ringLike) {
    // Ring and chains: receive from prev, send to next
    struct ncclRing* ring = &args->channel->ring;
    if (NeedProxy(RECV, pattern, root, ring, nranks)) {
      NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
    }
    if (NeedProxy(SEND, pattern, root, ring, nranks)) {
      NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
    }
  }
  if (treeUpward) {
    // Tree up: receive from every child, send to the parent
    struct ncclTree* tree = &args->channel->treeUp;
    for (int child=0; child<NCCL_MAX_TREE_ARITY; child++) {
      NCCLCHECK(SaveProxy<proxyRecv>(tree->down[child], args));
    }
    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
  }
  if (treeDownward) {
    // Tree down: send to every child, receive from the parent
    struct ncclTree* tree = &args->channel->treeDn;
    for (int child=0; child<NCCL_MAX_TREE_ARITY; child++) {
      NCCLCHECK(SaveProxy<proxySend>(tree->down[child], args));
    }
    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
  }
  if (pattern == ncclPatternCollTreeUp) {
    // CollTree up: only down[0] is used on the receive side
    struct ncclTree* tree = &args->channel->collTreeUp;
    NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
  }
  if (pattern == ncclPatternCollTreeDown) {
    // CollTree down: only down[0] is used on the send side
    struct ncclTree* tree = &args->channel->collTreeDn;
    NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
  }
  return ncclSuccess;
}
|
||||
|
||||
// Queue the proxy operations for a point-to-point send and/or receive on
// `channel`.
//   info    : operation description; delta, sendbytes, recvbytes, datatype and
//             comm are read
//   channel : channel the operations are queued on
// Nothing is queued unless info->delta > 0. A send is queued when
// info->sendbytes >= 0 and a receive when info->recvbytes >= 0; each uses at
// least one step.
// Returns ncclSuccess or the first error reported by SaveProxy.
ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel) {
  struct ncclComm* comm = info->comm;

  struct ncclProxyArgs args;
  memset(&args, 0, sizeof(struct ncclProxyArgs));
  args.channel = channel;
  args.sliceSteps = 1;
  args.chunkSteps = 1;
  args.protocol = NCCL_PROTO_SIMPLE;
  args.opCount = comm->opCount;
  args.dtype = info->datatype;

  if (!(info->delta > 0)) return ncclSuccess;

  if (info->sendbytes >= 0) {
    // Send to the rank `delta` positions after this one
    int peersend = (comm->rank+info->delta)%comm->nRanks;
    args.nsteps = DIVUP(info->sendbytes, comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS);
    if (args.nsteps == 0) args.nsteps = 1;
    NCCLCHECK(SaveProxy<proxySend>(peersend, &args));
  }
  if (info->recvbytes >= 0) {
    // Receive from the rank `delta` positions before this one
    int peerrecv = (comm->nRanks+comm->rank-info->delta)%comm->nRanks;
    args.nsteps = DIVUP(info->recvbytes, comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS);
    if (args.nsteps == 0) args.nsteps = 1;
    NCCLCHECK(SaveProxy<proxyRecv>(peerrecv, &args));
  }
  return ncclSuccess;
}
|
||||
|
||||
void* persistentThread(void *comm_) {
|
||||
struct ncclComm* comm = (struct ncclComm*)comm_;
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
struct ncclProxyArgs* op = NULL;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int idle = 1;
|
||||
int idleSpin = 0;
|
||||
while (1) {
|
||||
do {
|
||||
if (*comm->abortFlag) return NULL;
|
||||
if (op == NULL) {
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
op = state->ops;
|
||||
if (op == NULL) {
|
||||
if (state->stop) {
|
||||
// No more commands to process and proxy has been requested to stop
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
return NULL;
|
||||
}
|
||||
pthread_cond_wait(&state->cond, &state->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
}
|
||||
} while (op == NULL);
|
||||
op->idle = 0;
|
||||
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
|
||||
// yet and might be cancelled before they even start. Hold on on those.
|
||||
if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
|
||||
if (ret != ncclSuccess) {
|
||||
comm->fatalError = ret;
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
|
||||
return NULL;
|
||||
}
|
||||
idle &= op->idle;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
if (!idle) idleSpin = 0;
|
||||
struct ncclProxyArgs *next = op->next;
|
||||
if (next->state == ncclProxyOpNone) {
|
||||
struct ncclProxyArgs *freeOp = next;
|
||||
if (next->nextPeer) {
|
||||
// Replace next by its next per-peer element.
|
||||
next = next->nextPeer;
|
||||
if (op != freeOp) {
|
||||
next->next = freeOp->next;
|
||||
op->next = next;
|
||||
} else {
|
||||
next->next = next;
|
||||
}
|
||||
} else {
|
||||
// Remove next from circular list
|
||||
next->connector->proxyAppend = NULL;
|
||||
if (op != freeOp) {
|
||||
next = next->next;
|
||||
op->next = next;
|
||||
} else {
|
||||
next = NULL;
|
||||
}
|
||||
}
|
||||
if (freeOp == state->ops) state->ops = next;
|
||||
freeOp->next = state->pool;
|
||||
state->pool = freeOp;
|
||||
}
|
||||
op = next;
|
||||
if (op == state->ops) {
|
||||
if (idle == 1) {
|
||||
if (++idleSpin == 10) {
|
||||
sched_yield();
|
||||
idleSpin = 0;
|
||||
}
|
||||
}
|
||||
idle = 1;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
}
|
||||
}
|
||||
|
||||
// Wake the proxy thread if at least one operation is queued in
// comm->proxyState.ops. The check and the signal happen under the proxy
// state mutex. Always returns ncclSuccess.
ncclResult_t ncclProxyStart(struct ncclComm* comm) {
  struct ncclProxyState* state = &comm->proxyState;
  pthread_mutex_lock(&state->mutex);
  const bool hasWork = (state->ops != NULL);
  if (hasWork) {
    pthread_cond_signal(&state->cond);
  }
  pthread_mutex_unlock(&state->mutex);
  return ncclSuccess;
}
|
||||
|
||||
// Initialize comm's proxy state (condition variable, mutex, empty operation
// list) and start the proxy thread running persistentThread. Does nothing if
// comm->proxyThread is already set.
// Returns ncclSuccess, or ncclSystemError when the thread cannot be created.
ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
  if (comm->proxyThread) return ncclSuccess;

  comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
  comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
  comm->proxyState.ops = NULL;
  int err = pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
  if (err != 0) {
    // The thread handle is unspecified after a failed pthread_create; clear it
    // so that later `if (comm->proxyThread)` checks do not join a bogus thread.
    comm->proxyThread = 0;
    WARN("Failed to create proxy thread (pthread_create error %d)", err);
    return ncclSystemError;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Stop the proxy thread of comm and release the proxy argument pools.
// The stop request is posted and signalled under the proxy state mutex, the
// thread is joined if one was recorded in comm->proxyThread, and every block
// linked from state->pools is then freed. Always returns ncclSuccess.
ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
  struct ncclProxyState* state = &comm->proxyState;

  // Request the proxy to stop and then wake it
  pthread_mutex_lock(&state->mutex);
  state->stop = true;
  pthread_cond_signal(&state->cond);
  pthread_mutex_unlock(&state->mutex);
  if (comm->proxyThread) {
    pthread_join(comm->proxyThread, NULL);
  }

  // Free off any memory allocated for the proxy arg pools
  pthread_mutex_lock(&state->mutex);
  while (struct ncclProxyPool* head = state->pools) {
    state->pools = head->next;
    free(head);
  }
  pthread_mutex_unlock(&state->mutex);

  return ncclSuccess;
}
|
||||
+60
-239
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
#include "bootstrap.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
@@ -18,248 +19,68 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
||||
netTransport,
|
||||
};
|
||||
|
||||
#define RECV 0
|
||||
#define SEND 1
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
|
||||
|
||||
/* In chains, one rank does not need a proxy. Let's figure out which one it is */
|
||||
// Which index in the reorganized rings should we compare root against */
|
||||
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
|
||||
int index = pattern == ncclPatternPipelineFrom ?
|
||||
/* no recv / no send if root = */
|
||||
/* bcast */ (type == RECV ? myrank : nextrank ):
|
||||
/* reduce */ (type == RECV ? prevrank : myrank );
|
||||
int rank = ring->userRanks[index];
|
||||
return (root != rank);
|
||||
}
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
|
||||
#define PROXYARGS_ALLOCATE_SIZE 32
|
||||
struct ncclProxyPool {
|
||||
struct ncclProxyPool *next;
|
||||
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
|
||||
};
|
||||
|
||||
// Take one ncclProxyArgs element from comm's free list (state->pool) and
// return it through *argsptr with its next and nextPeer links cleared.
// When the free list is empty, a new block of PROXYARGS_ALLOCATE_SIZE elements
// is allocated, chained, and recorded in state->pools for later release.
// Returns ncclSuccess, or the error reported by ncclCalloc when a new block
// cannot be allocated (in which case *argsptr is left untouched).
ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
  struct ncclProxyState* state = &comm->proxyState;
  struct ncclProxyArgs* elem;
  pthread_mutex_lock(&state->mutex);
  if (state->pool == NULL) {
    // Allocate a new pool of elements
    struct ncclProxyPool* newPool;
    ncclResult_t allocRes = ncclCalloc(&newPool, 1);
    if (allocRes != ncclSuccess) {
      // Drop the lock before bailing out; returning with it held would
      // block every later user of state->mutex.
      pthread_mutex_unlock(&state->mutex);
      return allocRes;
    }
    struct ncclProxyArgs* newElems = newPool->elems;
    // Chain newly allocated elements. The last element's next is not written
    // here (presumably left NULL by ncclCalloc).
    for (int i=0; i+1 < PROXYARGS_ALLOCATE_SIZE; i++) {
      newElems[i].next = newElems+i+1;
    }
    // Add them all to the pool list
    state->pool = newElems;
    // Save the pool memory block for later resource release
    newPool->next = state->pools;
    state->pools = newPool;
  }
  // Pop the head of the free list
  elem = state->pool;
  state->pool = state->pool->next;
  pthread_mutex_unlock(&state->mutex);
  elem->next = elem->nextPeer = NULL;
  *argsptr = elem;
  return ncclSuccess;
}
|
||||
|
||||
static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
|
||||
struct ncclComm* comm = connector->comm;
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
if (connector->proxyAppend == NULL) {
|
||||
// Nothing running for that peer. Add to the circular list
|
||||
if (state->ops == NULL) {
|
||||
// Create the list
|
||||
args->next = args;
|
||||
state->ops = args;
|
||||
} else {
|
||||
// Insert element in the list
|
||||
args->next = state->ops->next;
|
||||
state->ops->next = args;
|
||||
}
|
||||
connector->proxyAppend = args;
|
||||
} else {
|
||||
// There is an active operation already for that peer.
|
||||
// Add it to the per-peer list
|
||||
connector->proxyAppend->nextPeer = args;
|
||||
connector->proxyAppend = args;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
|
||||
if (peer < 0) return ncclSuccess;
|
||||
|
||||
struct ncclPeer* peerComm = args->channel->peers+peer;
|
||||
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
|
||||
if (connector->transportComm == NULL) return ncclInternalError;
|
||||
if (connector->transportComm->proxy == NULL) return ncclSuccess;
|
||||
|
||||
struct ncclProxyArgs* op;
|
||||
NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op));
|
||||
memcpy(op, args, sizeof(struct ncclProxyArgs));
|
||||
op->connector = connector;
|
||||
op->progress = connector->transportComm->proxy;
|
||||
op->state = ncclProxyOpReady;
|
||||
ProxyAppend(connector, op);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Queue the proxy operations a collective needs on args->channel, depending
// on its communication pattern.
//   args    : template proxy operation (copied for each peer by SaveProxy)
//   pattern : ncclPattern* value of the collective
//   root    : root rank, used for the pipeline patterns
//   nranks  : number of ranks, forwarded to NeedProxy
// Returns ncclSuccess or the first error reported by SaveProxy.
ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
  const bool ringLike = pattern == ncclPatternRing || pattern == ncclPatternRingTwice ||
                        pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo;
  const bool treeUpward = pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown;
  const bool treeDownward = pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown;

  if (ringLike) {
    // Ring and chains: receive from prev, send to next
    struct ncclRing* ring = &args->channel->ring;
    if (NeedProxy(RECV, pattern, root, ring, nranks)) {
      NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
    }
    if (NeedProxy(SEND, pattern, root, ring, nranks)) {
      NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
    }
  }
  if (treeUpward) {
    // Tree up: receive from every child, send to the parent
    struct ncclTree* tree = &args->channel->treeUp;
    for (int child=0; child<NCCL_MAX_TREE_ARITY; child++) {
      NCCLCHECK(SaveProxy<proxyRecv>(tree->down[child], args));
    }
    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
  }
  if (treeDownward) {
    // Tree down: send to every child, receive from the parent
    struct ncclTree* tree = &args->channel->treeDn;
    for (int child=0; child<NCCL_MAX_TREE_ARITY; child++) {
      NCCLCHECK(SaveProxy<proxySend>(tree->down[child], args));
    }
    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
  }
  if (pattern == ncclPatternCollTreeUp) {
    // CollTree up: only down[0] is used on the receive side
    struct ncclTree* tree = &args->channel->collTreeUp;
    NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
  }
  if (pattern == ncclPatternCollTreeDown) {
    // CollTree down: only down[0] is used on the send side
    struct ncclTree* tree = &args->channel->collTreeDn;
    NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
  }
  return ncclSuccess;
}
|
||||
|
||||
void* persistentThread(void *comm_) {
|
||||
struct ncclComm* comm = (struct ncclComm*)comm_;
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
struct ncclProxyArgs* op = NULL;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int idle = 1;
|
||||
int idleSpin = 0;
|
||||
while (1) {
|
||||
do {
|
||||
if (LOAD(comm->abortFlag)) return NULL;
|
||||
if (op == NULL) {
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
op = state->ops;
|
||||
if (op == NULL) {
|
||||
if (state->stop) {
|
||||
// No more commands to process and proxy has been requested to stop
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
return NULL;
|
||||
}
|
||||
pthread_cond_wait(&state->cond, &state->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
}
|
||||
} while (op == NULL);
|
||||
op->idle = 0;
|
||||
// opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
|
||||
// yet and might be cancelled before they even start. Hold on on those.
|
||||
if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
|
||||
if (ret != ncclSuccess) {
|
||||
comm->fatalError = ret;
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
|
||||
return NULL;
|
||||
static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
|
||||
if (ret) {
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId));
|
||||
return ncclSuccess;
|
||||
}
|
||||
idle &= op->idle;
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
if (!idle) idleSpin = 0;
|
||||
struct ncclProxyArgs *next = op->next;
|
||||
if (next->state == ncclProxyOpNone) {
|
||||
struct ncclProxyArgs *freeOp = next;
|
||||
if (next->nextPeer) {
|
||||
// Replace next by its next per-peer element.
|
||||
next = next->nextPeer;
|
||||
if (op != freeOp) {
|
||||
next->next = freeOp->next;
|
||||
op->next = next;
|
||||
} else {
|
||||
next->next = next;
|
||||
}
|
||||
} else {
|
||||
// Remove next from circular list
|
||||
next->connector->proxyAppend = NULL;
|
||||
if (op != freeOp) {
|
||||
next = next->next;
|
||||
op->next = next;
|
||||
} else {
|
||||
next = NULL;
|
||||
}
|
||||
}
|
||||
if (freeOp == state->ops) state->ops = next;
|
||||
freeOp->next = state->pool;
|
||||
state->pool = freeOp;
|
||||
}
|
||||
op = next;
|
||||
if (op == state->ops) {
|
||||
if (idle == 1) {
|
||||
if (++idleSpin == 10) {
|
||||
sched_yield();
|
||||
idleSpin = 0;
|
||||
}
|
||||
}
|
||||
idle = 1;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
}
|
||||
WARN("No transport found !");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Wake the proxy thread if at least one operation is queued in
// comm->proxyState.ops. The check and the signal happen under the proxy
// state mutex. Always returns ncclSuccess.
ncclResult_t transportStartProxy(struct ncclComm* comm) {
  struct ncclProxyState* state = &comm->proxyState;
  pthread_mutex_lock(&state->mutex);
  const bool hasWork = (state->ops != NULL);
  if (hasWork) {
    pthread_cond_signal(&state->cond);
  }
  pthread_mutex_unlock(&state->mutex);
  return ncclSuccess;
}
|
||||
|
||||
// Initialize comm's proxy state (condition variable, mutex, empty operation
// list) and start the proxy thread running persistentThread. Does nothing if
// comm->proxyThread is already set.
// Returns ncclSuccess, or ncclSystemError when the thread cannot be created.
ncclResult_t transportCreateProxy(struct ncclComm* comm) {
  if (comm->proxyThread) return ncclSuccess;

  comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
  comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
  comm->proxyState.ops = NULL;
  int err = pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
  if (err != 0) {
    // The thread handle is unspecified after a failed pthread_create; clear it
    // so that later `if (comm->proxyThread)` checks do not join a bogus thread.
    comm->proxyThread = 0;
    WARN("Failed to create proxy thread (pthread_create error %d)", err);
    return ncclSystemError;
  }
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t transportDestroyProxy(struct ncclComm* comm) {
|
||||
struct ncclProxyState* state = &comm->proxyState;
|
||||
|
||||
// Request the proxy to stop and then wake it
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
state->stop = true;
|
||||
pthread_cond_signal(&state->cond);
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
|
||||
|
||||
// Free off any memory allocated for the proxy arg pools
|
||||
pthread_mutex_lock(&state->mutex);
|
||||
struct ncclProxyState* proxyState = &comm->proxyState;
|
||||
while (proxyState->pools != NULL) {
|
||||
struct ncclProxyPool *next = proxyState->pools->next;
|
||||
free(proxyState->pools);
|
||||
proxyState->pools = next;
|
||||
}
|
||||
pthread_mutex_unlock(&state->mutex);
|
||||
|
||||
// Set up and connect the point-to-point connectors of `channel` towards the
// given peers.
//   comm     : communicator (topology, peer info, bootstrap handle, rank)
//   graph    : topology graph handed to selectTransport
//   channel  : channel whose peers[] / devPeers[] entries are filled in
//   nrecv    : number of entries in peerRecv
//   peerRecv : ranks to receive from; -1 and ranks >= comm->nRanks are ignored
//   nsend    : number of entries in peerSend
//   peerSend : ranks to send to; -1 and ranks >= comm->nRanks are ignored
// Connectors that are already connected are left alone and only counted for
// the final trace. Returns ncclSuccess or the first error encountered.
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
  struct ncclConnect connect;
  struct ncclConnector* conn;

  // Pass 1: pick a transport for each new connector and send our connect
  // information to the peer through the bootstrap network.
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].recv;
    if (conn->connected) { ++nSkippedRecv; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].send;
    if (conn->connected) { ++nSkippedSend; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
  }

  // Pass 2: receive the peer's connect information, finish the connection and
  // copy the connector to the device-side peer table. Already-connected
  // connectors were counted in pass 1 and are not counted again here, so the
  // trace below reports each skipped connector once.
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].send;
    if (conn->connected) continue;
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
    conn->connected = 1;
    CUDACHECK(hipMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
  }
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1 || peer >= comm->nRanks) continue;
    conn = &channel->peers[peer].recv;
    if (conn->connected) continue;
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
    NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
    conn->connected = 1;
    CUDACHECK(hipMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
  }
  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
  return ncclSuccess;
}
|
||||
|
||||
+148
-145
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -16,17 +16,10 @@ struct collNetRecvConnectInfo {
|
||||
|
||||
struct collNetSendConnectInfo {
|
||||
void* collNetComm;
|
||||
void* mhandle;
|
||||
void* llMhandle;
|
||||
void* mhandles[NCCL_NUM_PROTOCOLS];
|
||||
struct reqSlot* reqFifo;
|
||||
};
|
||||
|
||||
struct ncclLLDataLine {
|
||||
uint32_t data1;
|
||||
uint32_t data2;
|
||||
};
|
||||
static_assert(sizeof(struct ncclLLDataLine) == sizeof(union ncclLLFifoLine)>>1, "ncclLLDataLine is not half size of ncclLLFifoLine");
|
||||
|
||||
struct reqSlot {
|
||||
volatile void* recvBuff;
|
||||
volatile int size;
|
||||
@@ -38,14 +31,11 @@ struct collNetSendResources {
|
||||
struct ncclRecvMem* hostRecvMem;
|
||||
struct ncclSendMem* devHostSendMem;
|
||||
struct ncclRecvMem* devHostRecvMem;
|
||||
struct ncclLLDataLine* llData;
|
||||
uint32_t* llData;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int buffSize;
|
||||
void* sendMhandle;
|
||||
void* llSendMhandle;
|
||||
void* recvMhandle;
|
||||
void* llRecvMhandle;
|
||||
void* sendMhandles[NCCL_NUM_PROTOCOLS];
|
||||
void* recvMhandles[NCCL_NUM_PROTOCOLS];
|
||||
struct ncclRecvMem* devRecvMem;
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
@@ -60,12 +50,10 @@ struct collNetRecvResources {
|
||||
struct ncclRecvMem* hostRecvMem;
|
||||
struct ncclSendMem* devHostSendMem;
|
||||
struct ncclRecvMem* devHostRecvMem;
|
||||
struct ncclLLDataLine* llData;
|
||||
uint32_t* llData;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int buffSize;
|
||||
void* mhandle;
|
||||
void* llMhandle;
|
||||
void* mhandles[NCCL_NUM_PROTOCOLS];
|
||||
struct ncclRecvMem* devRecvMem;
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
@@ -80,112 +68,120 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
|
||||
}
|
||||
|
||||
/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
|
||||
ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
struct collNetSendResources* sendResources;
|
||||
NCCLCHECK(ncclCalloc(&sendResources, 1));
|
||||
send->transportResources = sendResources;
|
||||
ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
struct collNetSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &sendResources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostSendMem, (void**)&sendResources->devHostSendMem, sendSize));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
|
||||
resources->devHostSendMem = resources->hostSendMem;
|
||||
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
|
||||
if (sendResources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&sendResources->devRecvMem), recvSize, true));
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p];
|
||||
|
||||
if (resources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostRecvMem, (void**)&sendResources->devHostRecvMem, recvSize));
|
||||
NCCLCHECK(ncclIbMalloc((void**)&(sendResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
|
||||
sendResources->buffSize = buffSize;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), sendResources->netDev,
|
||||
sendResources->useGdr ? "/GDRDMA" : "");
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
|
||||
resources->devHostRecvMem = resources->hostRecvMem;
|
||||
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Setup recv connector */
|
||||
ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
struct collNetRecvResources* recvResources;
|
||||
NCCLCHECK(ncclCalloc(&recvResources, 1));
|
||||
recv->transportResources = recvResources;
|
||||
ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
struct collNetRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &recvResources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostSendMem, (void**)&recvResources->devHostSendMem, sendSize));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
|
||||
resources->devHostSendMem = resources->hostSendMem;
|
||||
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
|
||||
if (recvResources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&recvResources->devRecvMem), recvSize, true));
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p];
|
||||
|
||||
if (resources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostRecvMem, (void**)&recvResources->devHostRecvMem, recvSize));
|
||||
NCCLCHECK(ncclIbMalloc((void**)&(recvResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
|
||||
recvResources->buffSize = buffSize;
|
||||
NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
|
||||
resources->devHostRecvMem = resources->hostRecvMem;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), recvResources->netDev,
|
||||
recvResources->useGdr ? "/GDRDMA" : "");
|
||||
NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
|
||||
NCCLCHECK(collNetListen(recvResources->netDev, &info->collNetHandle, &recvResources->netListenComm));
|
||||
|
||||
NCCLCHECK(collNetListen(resources->netDev, &info->collNetHandle, &resources->netListenComm));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
|
||||
// Setup device pointers
|
||||
struct collNetSendResources* sendResources = (struct collNetSendResources*)send->transportResources;
|
||||
sendResources->collNetRank = rank;
|
||||
|
||||
// Get info from recv side
|
||||
struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
|
||||
sendResources->reqFifo = sInfo->reqFifo;
|
||||
sendResources->collNetSendComm = sInfo->collNetComm;
|
||||
sendResources->recvMhandle = sInfo->mhandle;
|
||||
sendResources->llRecvMhandle = sInfo->llMhandle;
|
||||
struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
|
||||
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
|
||||
|
||||
// Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
|
||||
struct ncclRecvMem* sRecvMem = sendResources->useGdr ? sendResources->devRecvMem : sendResources->devHostRecvMem;
|
||||
// Register buffers
|
||||
NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sRecvMem->buff, sendResources->buffSize,
|
||||
sendResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &sendResources->sendMhandle));
|
||||
NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sendResources->llData,
|
||||
NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &sendResources->llSendMhandle));
|
||||
|
||||
send->conn.buff = sRecvMem->buff;
|
||||
send->conn.llBuff = sendResources->devHostRecvMem->llBuff;
|
||||
send->conn.direct |= sendResources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
|
||||
offset += send->comm->buffSizes[p];
|
||||
}
|
||||
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
// Head/Tail/Opcount/Fifos are always on host
|
||||
send->conn.tail = &sendResources->devHostRecvMem->tail;
|
||||
send->conn.opCountRem = &sendResources->devHostRecvMem->opCount;
|
||||
send->conn.fifo = sendResources->devHostRecvMem->sizesFifo;
|
||||
send->conn.head = &sendResources->devHostSendMem->head;
|
||||
send->conn.opCountLoc = &sendResources->devHostSendMem->opCount;
|
||||
send->conn.tail = &resources->devHostRecvMem->tail;
|
||||
send->conn.opCountRem = &resources->devHostRecvMem->opCount;
|
||||
send->conn.fifo = resources->devHostRecvMem->sizesFifo;
|
||||
send->conn.head = &resources->devHostSendMem->head;
|
||||
send->conn.opCountLoc = &resources->devHostSendMem->opCount;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
|
||||
|
||||
// Get info from recv side
|
||||
resources->collNetRank = rank;
|
||||
resources->reqFifo = info->reqFifo;
|
||||
resources->collNetSendComm = info->collNetComm;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
resources->recvMhandles[p] = info->mhandles[p];
|
||||
|
||||
// Register buffers
|
||||
NCCLCHECK(collNetRegMr(resources->collNetSendComm, send->conn.buffs[NCCL_PROTO_SIMPLE], send->comm->buffSizes[NCCL_PROTO_SIMPLE],
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
NCCLCHECK(collNetRegMr(resources->collNetSendComm, resources->llData, send->comm->buffSizes[NCCL_PROTO_LL]/2,
|
||||
NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_LL]));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
|
||||
// Setup device pointers
|
||||
struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recv->transportResources;
|
||||
struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
|
||||
recvResources->collNetRank = rank;
|
||||
struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
|
||||
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
|
||||
resources->collNetRank = rank;
|
||||
|
||||
// Intermediate buffering on GPU for GPU Direct RDMA
|
||||
struct ncclRecvMem* rRecvMem = recvResources->useGdr ? recvResources->devRecvMem : recvResources->devHostRecvMem;
|
||||
recv->conn.buff = rRecvMem->buff;
|
||||
recv->conn.llBuff = recvResources->devHostRecvMem->llBuff; // recv LL buff always on host
|
||||
recv->conn.direct |= recvResources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
|
||||
offset += recv->comm->buffSizes[p];
|
||||
}
|
||||
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
// Head/Tail/Opcount are always on host
|
||||
recv->conn.tail = &recvResources->devHostRecvMem->tail;
|
||||
recv->conn.opCountLoc = &recvResources->devHostRecvMem->opCount;
|
||||
recv->conn.head = &recvResources->devHostSendMem->head;
|
||||
recv->conn.opCountRem = &recvResources->devHostSendMem->opCount;
|
||||
recv->conn.tail = &resources->devHostRecvMem->tail;
|
||||
recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
|
||||
recv->conn.head = &resources->devHostSendMem->head;
|
||||
recv->conn.opCountRem = &resources->devHostSendMem->opCount;
|
||||
|
||||
// Connect to coll comm
|
||||
collNetHandle_t** handlePtrs = NULL;
|
||||
@@ -195,64 +191,64 @@ ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, in
|
||||
handlePtrs[i] = &(info->collNetHandle);
|
||||
}
|
||||
ncclResult_t res;
|
||||
NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, recvResources->netListenComm, &recvResources->collNetRecvComm), res, cleanup);
|
||||
NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, resources->netListenComm, &resources->collNetRecvComm), res, cleanup);
|
||||
|
||||
// Register buffers
|
||||
NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, rRecvMem->buff, recvResources->buffSize,
|
||||
recvResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &recvResources->mhandle));
|
||||
NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, recvResources->llData,
|
||||
NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &recvResources->llMhandle));
|
||||
NCCLCHECK(collNetRegMr(resources->collNetRecvComm, recv->conn.buffs[NCCL_PROTO_SIMPLE], recv->comm->buffSizes[NCCL_PROTO_SIMPLE],
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
NCCLCHECK(collNetRegMr(resources->collNetRecvComm, resources->llData, recv->comm->buffSizes[NCCL_PROTO_LL]/2,
|
||||
NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_LL]));
|
||||
|
||||
// Create shared info between send and recv proxies
|
||||
NCCLCHECK(ncclCalloc(&(recvResources->reqFifo), NCCL_STEPS));
|
||||
NCCLCHECK(ncclCalloc(&(resources->reqFifo), NCCL_STEPS));
|
||||
|
||||
// Pass info to send side
|
||||
sInfo->reqFifo = recvResources->reqFifo;
|
||||
sInfo->collNetComm = recvResources->collNetRecvComm;
|
||||
sInfo->mhandle = recvResources->mhandle;
|
||||
sInfo->llMhandle = recvResources->llMhandle;
|
||||
info->reqFifo = resources->reqFifo;
|
||||
info->collNetComm = resources->collNetRecvComm;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
info->mhandles[p] = resources->mhandles[p];
|
||||
|
||||
cleanup:
|
||||
if (handlePtrs != NULL) free(handlePtrs);
|
||||
// Close listen comm
|
||||
NCCLCHECK(collNetCloseListen(recvResources->netListenComm));
|
||||
NCCLCHECK(collNetCloseListen(resources->netListenComm));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ncclResult_t collNetSendFree(void* sendTransportResources) {
|
||||
struct collNetSendResources* sendResources = (struct collNetSendResources*)sendTransportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(sendResources->hostSendMem));
|
||||
NCCLCHECK(ncclCudaHostFree(sendResources->hostRecvMem));
|
||||
if (sendResources->collNetSendComm) {
|
||||
NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->sendMhandle));
|
||||
NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->llSendMhandle));
|
||||
struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
if (resources->collNetSendComm) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL]));
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
}
|
||||
if (sendResources->useGdr)
|
||||
CUDACHECK(hipFree(sendResources->devRecvMem));
|
||||
free(sendResources->llData);
|
||||
free(sendResources);
|
||||
if (resources->useGdr)
|
||||
CUDACHECK(hipFree(resources->devRecvMem));
|
||||
free(resources->llData);
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t collNetRecvFree(void* recvTransportResources) {
|
||||
struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recvTransportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(recvResources->hostSendMem));
|
||||
if (recvResources->collNetRecvComm) {
|
||||
NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->mhandle));
|
||||
NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->llMhandle));
|
||||
struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
if (resources->collNetRecvComm) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL]));
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree(recvResources->hostRecvMem));
|
||||
if (recvResources->useGdr)
|
||||
CUDACHECK(hipFree(recvResources->devRecvMem));
|
||||
free(recvResources->llData);
|
||||
free(recvResources->reqFifo);
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
if (resources->useGdr)
|
||||
CUDACHECK(hipFree(resources->devRecvMem));
|
||||
free(resources->llData);
|
||||
free(resources->reqFifo);
|
||||
|
||||
// Make sure SendFree is called before RecvFree
|
||||
if (recvResources->collNetRecvComm) {
|
||||
NCCLCHECK(collNetCloseColl(recvResources->collNetRecvComm));
|
||||
if (resources->collNetRecvComm) {
|
||||
NCCLCHECK(collNetCloseColl(resources->collNetRecvComm));
|
||||
}
|
||||
free(recvResources);
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -274,6 +270,11 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* sendMhandle = resources->sendMhandles[p];
|
||||
void* recvMhandle = resources->recvMhandles[p];
|
||||
args->idle = 1;
|
||||
struct reqSlot* reqFifo = resources->reqFifo;
|
||||
if (args->head < args->end) {
|
||||
@@ -287,7 +288,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
|
||||
if (size != -1) {
|
||||
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
|
||||
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
||||
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
|
||||
int ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *f1 = &lines[i].flag1;
|
||||
@@ -295,16 +296,17 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
|
||||
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
|
||||
}
|
||||
if (ready) {
|
||||
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
|
||||
//separate data from flag
|
||||
struct ncclLLDataLine* sendBuff = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
|
||||
uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *d1 = &lines[i].data1;
|
||||
volatile uint32_t *d2 = &lines[i].data2;
|
||||
sendBuff[i].data1 = LOAD(d1);
|
||||
sendBuff[i].data2 = LOAD(d2);
|
||||
sendBuff[2*i] = LOAD(d1);
|
||||
sendBuff[2*i+1] = LOAD(d2);
|
||||
}
|
||||
int count = nFifoLines*sizeof(struct ncclLLDataLine) / ncclTypeSize(args->dtype);
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->llSendMhandle, resources->llRecvMhandle, args->requests+buffSlot));
|
||||
int count = nFifoLines*2*sizeof(uint32_t) / ncclTypeSize(args->dtype);
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
@@ -316,12 +318,10 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
}
|
||||
} else if (args->tail < LOAD(recvTail)) {
|
||||
int stepSize = args->channel->buffSize/NCCL_STEPS;
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
// Send through network
|
||||
if (LOAD(sizesFifo+buffSlot) != -1) {
|
||||
int count = LOAD(sizesFifo+buffSlot)/ncclTypeSize(args->dtype);
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localMem->buff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->sendMhandle, resources->recvMhandle, args->requests+buffSlot));
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localBuff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
@@ -378,16 +378,18 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
args->idle = 1;
|
||||
int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine) : args->channel->buffSize ) / NCCL_STEPS;
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* mhandle = resources->mhandles[p];
|
||||
struct reqSlot* reqFifo = resources->reqFifo;
|
||||
if (args->head < args->end) {
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)resources->llData : localMem->buff;
|
||||
void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : resources->mhandle;
|
||||
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
reqFifo[buffSlot].recvBuff = localBuff+buffSlot*stepSize;
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, localBuff+buffSlot*stepSize);
|
||||
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
|
||||
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
|
||||
reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize;
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, reqFifo[buffSlot].recvBuff);
|
||||
args->tail += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
@@ -399,16 +401,17 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
|
||||
if (args->protocol == NCCL_PROTO_LL) { // ll
|
||||
// re-attach flag
|
||||
uint32_t flag = args->head;
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(resources->hostRecvMem->llBuff)+buffSlot*NCCL_LL_SLICE_LINES;
|
||||
struct ncclLLDataLine* recvData = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
|
||||
int nFifoLines = DIVUP(reqFifo[buffSlot].size, sizeof(struct ncclLLDataLine));
|
||||
int stepLines = stepSize / sizeof(union ncclLLFifoLine);
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
|
||||
uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
|
||||
int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t));
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
lines[i].v[0] = ((uint64_t)flag << 32) + recvData[i].data1;
|
||||
lines[i].v[1] = ((uint64_t)flag << 32) + recvData[i].data2;
|
||||
lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
|
||||
lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
|
||||
}
|
||||
} else if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (resources->useGdr) collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle);
|
||||
STORE(&resources->hostRecvMem->tail, args->head);
|
||||
if (resources->useGdr) NCCLCHECK(collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle));
|
||||
resources->hostRecvMem->tail = args->head;
|
||||
}
|
||||
args->idle = 0;
|
||||
}
|
||||
|
||||
+142
-120
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -13,19 +13,20 @@ struct netConnectInfo {
|
||||
ncclNetHandle_t netHandle;
|
||||
};
|
||||
|
||||
#define LOC_HOSTMEM 0
|
||||
#define LOC_DEVMEM 1
|
||||
#define LOC_COUNT 2
|
||||
|
||||
struct netSendResources {
|
||||
void* netSendComm;
|
||||
struct ncclSendMem* hostSendMem;
|
||||
struct ncclRecvMem* hostRecvMem;
|
||||
struct ncclSendMem* devHostSendMem;
|
||||
struct ncclRecvMem* devHostRecvMem;
|
||||
struct ncclSendMem* sendMem;
|
||||
struct ncclRecvMem* recvMem;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int buffSize;
|
||||
void* mhandle;
|
||||
void* llMhandle;
|
||||
void* ll128Mhandle;
|
||||
struct ncclRecvMem* devRecvMem;
|
||||
char* buffers[LOC_COUNT];
|
||||
int buffSizes[LOC_COUNT];
|
||||
void* mhandles[LOC_COUNT];
|
||||
void** mhandlesProto[NCCL_NUM_PROTOCOLS];
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
};
|
||||
@@ -33,17 +34,14 @@ struct netSendResources {
|
||||
struct netRecvResources {
|
||||
void* netListenComm;
|
||||
void* netRecvComm;
|
||||
struct ncclSendMem* hostSendMem;
|
||||
struct ncclRecvMem* hostRecvMem;
|
||||
struct ncclSendMem* devHostSendMem;
|
||||
struct ncclRecvMem* devHostRecvMem;
|
||||
struct ncclSendMem* sendMem;
|
||||
struct ncclRecvMem* recvMem;
|
||||
int netDev;
|
||||
int useGdr;
|
||||
int buffSize;
|
||||
void* mhandle;
|
||||
void* llMhandle;
|
||||
void* ll128Mhandle;
|
||||
struct ncclRecvMem* devRecvMem;
|
||||
char* buffers[LOC_COUNT];
|
||||
int buffSizes[LOC_COUNT];
|
||||
void* mhandles[LOC_COUNT];
|
||||
void** mhandlesProto[NCCL_NUM_PROTOCOLS];
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
@@ -57,84 +55,123 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
|
||||
/* Determine if we will use this transport for this peer and return connect
|
||||
* information for this peer */
|
||||
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
struct netSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
|
||||
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
|
||||
if (resources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
|
||||
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
send->conn.tail = &resources->recvMem->tail;
|
||||
send->conn.opCountRem = &resources->recvMem->opCount;
|
||||
send->conn.fifo = resources->recvMem->sizesFifo;
|
||||
send->conn.head = &resources->sendMem->head;
|
||||
send->conn.opCountLoc = &resources->sendMem->opCount;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
|
||||
|
||||
int protoLoc[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
|
||||
resources->buffSize = buffSize;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
// For p2p connections (graph == NULL), only allocate a buffer for the SIMPLE protocol
|
||||
buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : send->comm->buffSizes[p];
|
||||
resources->buffSizes[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
|
||||
}
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
|
||||
}
|
||||
|
||||
int offsets[LOC_COUNT];
|
||||
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
|
||||
send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
|
||||
offsets[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
struct netRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
|
||||
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
|
||||
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
|
||||
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
|
||||
if (resources->useGdr) {
|
||||
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
|
||||
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
recv->conn.tail = &resources->recvMem->tail;
|
||||
recv->conn.opCountLoc = &resources->recvMem->opCount;
|
||||
recv->conn.head = &resources->sendMem->head;
|
||||
recv->conn.opCountRem = &resources->sendMem->opCount;
|
||||
|
||||
int protoLoc[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
|
||||
resources->buffSize = buffSize;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
// For p2p connections (graph == NULL), only allocate a buffer for the SIMPLE protocol
|
||||
buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : recv->comm->buffSizes[p];
|
||||
resources->buffSizes[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr));
|
||||
}
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
|
||||
}
|
||||
|
||||
int offsets[LOC_COUNT];
|
||||
offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
|
||||
recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
|
||||
offsets[protoLoc[p]] += buffSizes[p];
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
|
||||
resources->useGdr ? "/GDRDMA" : "");
|
||||
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
// Setup device pointers
|
||||
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
|
||||
|
||||
// Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
send->conn.buff = recvMem->buff;
|
||||
send->conn.llBuff = resources->devHostRecvMem->llBuff;
|
||||
send->conn.ll128Buff = recvMem->ll128Buff;
|
||||
send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
// Head/Tail/Opcount/Fifos are always on host
|
||||
send->conn.tail = &resources->devHostRecvMem->tail;
|
||||
send->conn.opCountRem = &resources->devHostRecvMem->opCount;
|
||||
send->conn.fifo = resources->devHostRecvMem->sizesFifo;
|
||||
send->conn.head = &resources->devHostSendMem->head;
|
||||
send->conn.opCountLoc = &resources->devHostSendMem->opCount;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
|
||||
struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
|
||||
|
||||
// Connect to remote peer
|
||||
struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
|
||||
NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
|
||||
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
|
||||
NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
|
||||
}
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -143,42 +180,29 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
// Setup device pointers
|
||||
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
|
||||
|
||||
// Intermediate buffering on GPU for GPU Direct RDMA
|
||||
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
|
||||
recv->conn.buff = recvMem->buff;
|
||||
recv->conn.llBuff = recvMem->llBuff;
|
||||
recv->conn.ll128Buff = recvMem->ll128Buff;
|
||||
recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
// Head/Tail/Opcount are always on host
|
||||
recv->conn.tail = &resources->devHostRecvMem->tail;
|
||||
recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
|
||||
recv->conn.head = &resources->devHostSendMem->head;
|
||||
recv->conn.opCountRem = &resources->devHostSendMem->opCount;
|
||||
|
||||
// Finish connection establishment from remote peer
|
||||
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
|
||||
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
|
||||
|
||||
if (resources->buffSizes[LOC_DEVMEM]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
|
||||
}
|
||||
if (resources->buffSizes[LOC_HOSTMEM]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t netSendFree(void* transportResources) {
|
||||
struct netSendResources* resources = (struct netSendResources*)transportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
if (resources->useGdr)
|
||||
CUDACHECK(hipFree(resources->devRecvMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
|
||||
for (int l=0; l<LOC_COUNT; l++) {
|
||||
if (resources->buffers[l])
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
|
||||
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
|
||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
@@ -186,13 +210,14 @@ ncclResult_t netSendFree(void* transportResources) {
|
||||
|
||||
ncclResult_t netRecvFree(void* transportResources) {
|
||||
struct netRecvResources* resources = (struct netRecvResources*)transportResources;
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
|
||||
if (resources->useGdr)
|
||||
CUDACHECK(hipFree(resources->devRecvMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->sendMem));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->recvMem));
|
||||
for (int l=0; l<LOC_COUNT; l++) {
|
||||
if (resources->buffers[l])
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
|
||||
CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
|
||||
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
@@ -202,7 +227,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Update opCount
|
||||
STORE(&resources->hostRecvMem->opCount, args->opCount);
|
||||
STORE(&resources->recvMem->opCount, args->opCount);
|
||||
|
||||
// Round to next multiple of chunkSteps
|
||||
resources->step = ROUNDUP(resources->step, args->chunkSteps);
|
||||
@@ -212,18 +237,19 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* mhandle = *(resources->mhandlesProto[p]);
|
||||
args->idle = 1;
|
||||
if (args->head < args->end) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
|
||||
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
if (args->protocol == NCCL_PROTO_LL128) {
|
||||
int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
|
||||
if (args->tail < LOAD(recvTail)) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
if (LOAD(sizesFifo+buffSlot) != -1) {
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
char* localBuff = (char*)localMem->ll128Buff;
|
||||
int ready = resources->useGdr;
|
||||
if (!ready) {
|
||||
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
|
||||
@@ -238,7 +264,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
if (ready) {
|
||||
// Send through network
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->ll128Mhandle, args->requests+buffSlot));
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to -1 before we update the head.
|
||||
@@ -250,13 +276,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
}
|
||||
} else if (args->protocol == NCCL_PROTO_LL) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
int size = LOAD(sizesFifo+buffSlot);
|
||||
if (size != -1) {
|
||||
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
|
||||
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
|
||||
size = nFifoLines * sizeof(union ncclLLFifoLine);
|
||||
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
|
||||
union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
|
||||
int ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
volatile uint32_t *f1 = &lines[i].flag1;
|
||||
@@ -264,7 +289,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
|
||||
}
|
||||
if (ready) {
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to -1 before we update the head.
|
||||
@@ -275,12 +300,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
}
|
||||
} else if (args->tail < LOAD(recvTail)) {
|
||||
int stepSize = args->channel->buffSize/NCCL_STEPS;
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
// Send through network
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
if (LOAD(sizesFifo+buffSlot) != -1) {
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->mhandle, args->requests+buffSlot));
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to -1 before we update the head.
|
||||
@@ -297,7 +319,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
args->head += args->sliceSteps;
|
||||
STORE(&resources->hostSendMem->head, args->head);
|
||||
STORE(&resources->sendMem->head, args->head);
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
@@ -315,7 +337,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Update opCount
|
||||
STORE(&resources->hostSendMem->opCount, args->opCount);
|
||||
STORE(&resources->sendMem->opCount, args->opCount);
|
||||
|
||||
// Round to next multiple of chunkSteps
|
||||
resources->step = ROUNDUP(resources->step, args->chunkSteps);
|
||||
@@ -326,12 +348,12 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
args->idle = 1;
|
||||
int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
|
||||
int p = args->protocol;
|
||||
int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = args->connector->conn.buffs[p];
|
||||
void* mhandle = *(resources->mhandlesProto[p]);
|
||||
if (args->head < args->end) {
|
||||
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
|
||||
char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
|
||||
void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
|
||||
volatile uint64_t* sendHead = &resources->hostSendMem->head;
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) {
|
||||
int buffSlot = args->tail%NCCL_STEPS;
|
||||
int sliceSize = stepSize * args->sliceSteps;
|
||||
@@ -348,8 +370,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
if (done) {
|
||||
args->head += args->sliceSteps;
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
|
||||
STORE(&resources->hostRecvMem->tail, args->head);
|
||||
if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
|
||||
STORE(&resources->recvMem->tail, args->head);
|
||||
}
|
||||
args->idle = 0;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -113,6 +113,7 @@ static int ncclIbSpeed(int speed) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
static int shownIbHcaEnv = 0;
|
||||
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
|
||||
if (ncclParamIbDisable()) return ncclInternalError;
|
||||
|
||||
@@ -132,6 +133,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
|
||||
// Check if user defined which IB device:port to use
|
||||
char* userIbEnv = getenv("NCCL_IB_HCA");
|
||||
if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv);
|
||||
struct netIf userIfs[MAX_IB_DEVS];
|
||||
bool searchNot = userIbEnv && userIbEnv[0] == '^';
|
||||
if (searchNot) userIbEnv++;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
|
||||
+63
-20
@@ -16,6 +16,7 @@
|
||||
|
||||
struct p2pConnectInfo {
|
||||
int direct;
|
||||
int read;
|
||||
union {
|
||||
void* directPtr;
|
||||
hipIpcMemHandle_t devIpc;
|
||||
@@ -80,7 +81,8 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
}
|
||||
|
||||
// Check topology / p2p level.
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret));
|
||||
int read;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, &read));
|
||||
if (*ret == 0) return ncclSuccess;
|
||||
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
@@ -122,14 +124,32 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
} while (0)
|
||||
|
||||
#define MAX_SHM_NAME_LEN 1024
|
||||
// Setting this to non zero causes P2P to use Reads rather than Writes
|
||||
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
|
||||
|
||||
static int p2pUseRead(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
int readEnable = ncclParamP2pReadEnable();
|
||||
if (readEnable != -2) return readEnable;
|
||||
|
||||
int p2p, read;
|
||||
// Queries the topology to see if the GPUs are Ampere and
|
||||
// connected via NVLink, if so we enable P2P Read by default
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, &read));
|
||||
|
||||
return read;
|
||||
}
|
||||
|
||||
/* Send: Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
|
||||
struct p2pSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
int useRead = p2pUseRead(topo, myInfo, peerInfo);
|
||||
int sendSize = sizeof(struct ncclSendMem);
|
||||
// For P2P Read the SIMPLE buffer is tacked onto the end of the ncclSendMem structure
|
||||
if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
|
||||
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));
|
||||
|
||||
@@ -155,11 +175,14 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
TRACE(NCCL_P2P,"Open shmName %s", shmName);
|
||||
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
|
||||
|
||||
info.read = useRead;
|
||||
const char* useReadStr = info.read ? "/read" : "";
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
info.direct = 1;
|
||||
info.directPtr = resources->devMem;
|
||||
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%d] -> %d[%d] via P2P/common device%s",
|
||||
channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr);
|
||||
return ncclInternalError;
|
||||
} else {
|
||||
// Enable P2P access
|
||||
@@ -171,8 +194,8 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
}
|
||||
} else {
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
@@ -185,8 +208,8 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
|
||||
//TRACE_DUMP_IPC(&info.devIpc);
|
||||
}
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||
@@ -196,12 +219,15 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
|
||||
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
|
||||
|
||||
struct p2pRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
|
||||
int useRead = p2pUseRead(topo, myInfo, peerInfo);
|
||||
int recvSize = offsetof(struct ncclRecvMem, buff);
|
||||
// For P2P Read the SIMPLE buffer is tacked onto the end of the ncclSendMem structure, so its size is not added here
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
|
||||
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));
|
||||
|
||||
@@ -216,6 +242,7 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
TRACE(NCCL_P2P,"Open shmName %s", shmName);
|
||||
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
|
||||
|
||||
info.read = useRead;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
info.direct = 1;
|
||||
info.directPtr = resources->devMem;
|
||||
@@ -231,7 +258,7 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
peerInfo->cudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
}
|
||||
} else {
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
@@ -244,7 +271,7 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
myInfo->rank, peerCudaDev, peerInfo->busId, err, hipGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
//TRACE_DUMP_IPC(&info.devIpc);
|
||||
}
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
|
||||
@@ -259,7 +286,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
if (info->direct) {
|
||||
remDevMem = (struct ncclRecvMem*)(info->directPtr);
|
||||
send->conn.direct |= NCCL_DIRECT_GPU;
|
||||
if (info->read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
|
||||
} else {
|
||||
//TRACE_DUMP_IPC(&info->devIpc);
|
||||
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
|
||||
@@ -278,9 +305,16 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
|
||||
// Remove the file to ensure proper clean-up
|
||||
NCCLCHECK(shmUnlink(shmName));
|
||||
|
||||
send->conn.buff = remDevMem->buff;
|
||||
send->conn.llBuff = remDevMem->llBuff;
|
||||
send->conn.ll128Buff = remDevMem->ll128Buff;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
|
||||
send->conn.buffs[p] = resources->devMem->buff;
|
||||
} else {
|
||||
send->conn.buffs[p] = remDevMem->buff + offset;
|
||||
offset += send->comm->buffSizes[p];
|
||||
}
|
||||
}
|
||||
send->conn.tail = &remDevMem->tail;
|
||||
send->conn.opCountRem = resources->devRemOpCount;
|
||||
send->conn.head = &resources->devMem->head;
|
||||
@@ -297,8 +331,10 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
if (info->direct) {
|
||||
remDevMem = (struct ncclSendMem*)(info->directPtr);
|
||||
recv->conn.direct |= NCCL_DIRECT_GPU;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
if (info->read == 0) {
|
||||
recv->conn.direct |= NCCL_DIRECT_GPU;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
}
|
||||
} else {
|
||||
//TRACE_DUMP_IPC(&info->devIpc);
|
||||
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
|
||||
@@ -316,9 +352,16 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0));
|
||||
NCCLCHECK(shmUnlink(shmName));
|
||||
|
||||
recv->conn.buff = resources->devMem->buff;
|
||||
recv->conn.llBuff = resources->devMem->llBuff;
|
||||
recv->conn.ll128Buff = resources->devMem->ll128Buff;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
/* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
|
||||
recv->conn.buffs[p] = remDevMem->buff;
|
||||
} else {
|
||||
recv->conn.buffs[p] = resources->devMem->buff + offset;
|
||||
offset += recv->comm->buffSizes[p];
|
||||
}
|
||||
}
|
||||
recv->conn.tail = &resources->devMem->tail;
|
||||
recv->conn.opCountLoc = resources->devOpCount;
|
||||
recv->conn.head = &remDevMem->head;
|
||||
|
||||
+17
-11
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -57,7 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
#define MAX_SHM_NAME_LEN 1024
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
|
||||
|
||||
struct shmSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
@@ -75,13 +75,13 @@ ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
||||
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
|
||||
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
|
||||
struct shmRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
@@ -94,7 +94,9 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
|
||||
char shmName[MAX_SHM_NAME_LEN];
|
||||
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
|
||||
info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
|
||||
int shmSize = offsetof(struct ncclRecvMem, buff);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
|
||||
info.shmSize = resources->shmSize = shmSize;
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
|
||||
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||
|
||||
@@ -118,9 +120,11 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
NCCLCHECK(shmUnlink(shmName));
|
||||
|
||||
send->transportResources = resources;
|
||||
send->conn.buff = resources->devRemHostMem->buff;
|
||||
send->conn.llBuff = resources->devRemHostMem->llBuff;
|
||||
send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
send->conn.buffs[p] = resources->devRemHostMem->buff + offset;
|
||||
offset += send->comm->buffSizes[p];
|
||||
}
|
||||
send->conn.tail = &resources->devRemHostMem->tail;
|
||||
send->conn.opCountRem = &resources->devRemHostMem->opCount;
|
||||
|
||||
@@ -143,9 +147,11 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
recv->conn.head = &resources->devRemHostMem->head;
|
||||
recv->conn.opCountRem = &resources->devRemHostMem->opCount;
|
||||
|
||||
recv->conn.buff = resources->devHostMem->buff;
|
||||
recv->conn.llBuff = resources->devHostMem->llBuff;
|
||||
recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
recv->conn.buffs[p] = resources->devHostMem->buff + offset;
|
||||
offset += recv->comm->buffSizes[p];
|
||||
}
|
||||
recv->conn.tail = &resources->devHostMem->tail;
|
||||
recv->conn.opCountLoc = &resources->devHostMem->opCount;
|
||||
return ncclSuccess;
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele