130ee246e2
Change-Id: I92d32b15e516a39710b676aee692ae9b70638937 Reviewed-on: http://git-master/r/935458 Reviewed-by: Przemek Tredak <ptredak@nvidia.com> Tested-by: Przemek Tredak <ptredak@nvidia.com>
497 lines
19 KiB
Plaintext
497 lines
19 KiB
Plaintext
/*************************************************************************
|
|
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
************************************************************************/
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
|
|
#include "core.h"
|
|
#include "common_kernel.h"
|
|
#include "copy_kernel.h"
|
|
#include "enqueue.h"
|
|
|
|
/* HIERARCHY
|
|
*
|
|
* The data is split into CHUNKS, and each CHUNK is split into NUM_SUBCHUNKS
|
|
* SUBCHUNKS, where each SUBCHUNK is processed independently. A SUBCHUNK is
|
|
* split into numUnroll UNROLLS and each thread performs UNROLL_COUNT
|
|
* single-data-element operations inside an UNROLL. As the name suggests, the
|
|
* UNROLL_COUNT operations within an UNROLL are unrolled.
|
|
*/
|
|
|
|
|
|
// Number of threads used to perform copies, etc. Must be multiple of 32.
|
|
// An additional thread is used to handle threadfences, so the CUDA blocks
|
|
// have dimension NUM_THREADS+1.
|
|
#define NUM_THREADS 256
|
|
|
|
// Each thread unrolls the innermost loop of the copy or reduction operations
|
|
// to this many single-data-element instructions
|
|
#define UNROLL_COUNT 8
|
|
|
|
#define UNROLL_SIZE (UNROLL_COUNT * NUM_THREADS)
|
|
|
|
// To hide the latency associated with the synchronization between different
|
|
// subchunks, we interleave the independent subchunks so that more data can be
|
|
// transferred while the sync is in progress. This is the number of subchunks
|
|
// that are active at the same time
|
|
#define NUM_SUBCHUNKS 2
|
|
|
|
// If this is called with STEP, it means that we just finished processing the
|
|
// data for step STEP on this GPU, which is the data required on the next GPU
|
|
// for step STEP + 1, so we signal the next GPU that its data for step STEP + 1
|
|
// is available. This is called by one particular consumer warp and so we select
|
|
// the first thread in the warp to set the flag.
|
|
#define SIGNAL_NEW_DATA_AVAILABLE(chunk, subchunk, step) \
|
|
do { \
|
|
__threadfence_system(); \
|
|
args.NextNewDataAvailableFlag[0] = \
|
|
NUM_SUBCHUNKS*((chunk) * (args.NumGPUs - 1) + (step)) + subchunk+1; \
|
|
} while (0)
|
|
|
|
// This is called by all producer threads, but only thread 0 spins on the flag,
|
|
#define WAIT_FOR_NEW_DATA(chunk, subchunk, step) \
|
|
do { \
|
|
if (tid == 0) { \
|
|
Wait([=] { \
|
|
return ((volatile int *)args.ThisNewDataAvailableFlag)[0] >= \
|
|
NUM_SUBCHUNKS*((chunk) * (args.NumGPUs - 1) + (step)) \
|
|
+ subchunk + 1 - NUM_SUBCHUNKS; \
|
|
}); \
|
|
} \
|
|
BAR(sync, 1, NUM_THREADS); \
|
|
} while (0)
|
|
|
|
#define SIGNAL_CHUNK_DONE(chunk, subchunk) \
|
|
do { \
|
|
__threadfence_system(); \
|
|
args.PrevChunkDoneFlag[0] = NUM_SUBCHUNKS*(chunk) + (subchunk) + 1; \
|
|
} while (0)
|
|
|
|
#define WAIT_FOR_PREV_CHUNK(chunk, subchunk) \
|
|
do { \
|
|
if (tid == 0) { \
|
|
Wait([=] { \
|
|
return ((volatile int*)args.ThisChunkDoneFlag)[0] >= \
|
|
NUM_SUBCHUNKS*(chunk) + subchunk + 1-NUM_SUBCHUNKS; \
|
|
}); \
|
|
} \
|
|
BAR(sync, 1, NUM_THREADS); \
|
|
} while (0)
|
|
|
|
__device__ inline void getSliceSizeAndChunkSize(int *sliceSize, int slice,
|
|
int numSlices, int numBigSlices, int numSmallSlices, int bigSliceN,
|
|
int smallSliceN, int lastSliceN) {
|
|
if (slice < numBigSlices) {
|
|
*sliceSize = bigSliceN;
|
|
} else {
|
|
*sliceSize = (slice < numBigSlices + numSmallSlices) ? smallSliceN
|
|
: ((slice == numSlices - 1) ? lastSliceN : 0);
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
struct AllGatherKernelArgs {
|
|
// general parameters
|
|
int ThisId;
|
|
int NumGPUs;
|
|
int N;
|
|
int * UserFromRing;
|
|
|
|
// some pre-computed sizes
|
|
int SliceSize;
|
|
int ChunkSize;
|
|
int NumChunks;
|
|
|
|
int BufferSliceStride;
|
|
int BufferMisalignedN;
|
|
|
|
T ** ThisPtrToNextOutput;
|
|
T ** PrevPtrToThisOutput;
|
|
|
|
// local and remote input, output, and buffer
|
|
const T * __restrict__ ThisInput;
|
|
volatile T * __restrict__ ThisOutput;
|
|
volatile T * __restrict__ ThisBuffer;
|
|
volatile T * __restrict__ NextBuffer;
|
|
|
|
// local and remote flags
|
|
volatile int * __restrict__ ThisNewDataAvailableFlag;
|
|
volatile int * __restrict__ NextNewDataAvailableFlag;
|
|
volatile int * __restrict__ ThisChunkDoneFlag;
|
|
volatile int * __restrict__ PrevChunkDoneFlag;
|
|
};
|
|
|
|
__device__ inline int GetBlock(const int index, const int step,
|
|
const int * const userFromRing, const int numGPUs) {
|
|
return userFromRing[(numGPUs + index - step) % numGPUs];
|
|
}
|
|
|
|
__shared__ volatile void * nextOutput;
|
|
|
|
template<int THREADS, int UNROLL, bool PUSHRECV, typename T>
|
|
__global__ void AllGatherKernel(const AllGatherKernelArgs<T> args) {
|
|
if (args.N == 0) return;
|
|
int tid = threadIdx.x;
|
|
|
|
// First wait for args.PrevPtrToThisOutput to become nullptr to ensure that
|
|
// the previous GPU is done with a previous collective operation.
|
|
if (tid == 0) {
|
|
Wait([=] {
|
|
return *((T * volatile *)args.PrevPtrToThisOutput) == nullptr;
|
|
});
|
|
|
|
*((T * volatile *)args.PrevPtrToThisOutput) = (T*)args.ThisOutput;
|
|
|
|
Wait([=] {
|
|
return *((T * volatile *)args.ThisPtrToNextOutput) != nullptr;
|
|
});
|
|
|
|
if(PUSHRECV)
|
|
nextOutput = *((volatile void * volatile *)args.ThisPtrToNextOutput);
|
|
}
|
|
__syncthreads();
|
|
|
|
for (int chunk = 0; chunk < args.NumChunks; ++chunk) {
|
|
// calculate slice size. for all chunks except (possibly) the last one,
|
|
// this will just be args.SliceSize. For the last one, it may be smaller
|
|
int bigSliceN = args.SliceSize;
|
|
int smallSliceN = 0;
|
|
int lastSliceN = 0;
|
|
int numSlices = NUM_SUBCHUNKS;
|
|
int numBigSlices = numSlices;
|
|
int numSmallSlices = 0;
|
|
|
|
// last chunk
|
|
if ((chunk + 1 == args.NumChunks) && (args.N % args.ChunkSize > 0))
|
|
CalcLastChunk<THREADS, UNROLL, T>(&bigSliceN, &smallSliceN, &lastSliceN,
|
|
&numSlices, &numBigSlices, &numSmallSlices, args.N, args.NumChunks,
|
|
args.ChunkSize);
|
|
|
|
// this offset is only applied to Data pointers, not to Buffer pointers,
|
|
// since we only have one buffer per chunk
|
|
int chunkOffset = chunk * args.ChunkSize;
|
|
|
|
// step 0: copy the resident block from the ThisInput to ThisOutput and also
|
|
// to NextOutput
|
|
int step = 0;
|
|
int block = GetBlock(args.ThisId, step, args.UserFromRing, args.NumGPUs);
|
|
int outputOffset = chunkOffset + block * args.N;
|
|
int inputOffset = chunkOffset;
|
|
int bufferOffset;
|
|
int sliceSize;
|
|
|
|
if (!PUSHRECV) {
|
|
bufferOffset = block * NUM_SUBCHUNKS * args.BufferSliceStride +
|
|
block * args.BufferMisalignedN;
|
|
}
|
|
|
|
// Copy from ThisInput
|
|
if (tid < THREADS) {
|
|
for(int s=0; s<NUM_SUBCHUNKS; ++s) {
|
|
getSliceSizeAndChunkSize(&sliceSize, s, numSlices, numBigSlices,
|
|
numSmallSlices, bigSliceN, smallSliceN, lastSliceN);
|
|
|
|
if (!PUSHRECV)
|
|
WAIT_FOR_PREV_CHUNK(chunk, s);
|
|
|
|
if (PUSHRECV) {
|
|
DoubleCopy<UNROLL, THREADS>(
|
|
args.ThisOutput + outputOffset,
|
|
(volatile T *)nextOutput + outputOffset,
|
|
args.ThisInput + inputOffset,
|
|
sliceSize);
|
|
} else {
|
|
DoubleCopy<UNROLL, THREADS>(
|
|
args.ThisOutput + outputOffset,
|
|
args.NextBuffer + bufferOffset,
|
|
args.ThisInput + inputOffset,
|
|
sliceSize);
|
|
}
|
|
__syncthreads();
|
|
|
|
outputOffset += sliceSize;
|
|
inputOffset += sliceSize;
|
|
if (!PUSHRECV)
|
|
bufferOffset += sliceSize;
|
|
}
|
|
} else {
|
|
for(int s=0; s<NUM_SUBCHUNKS; ++s) {
|
|
__syncthreads();
|
|
SIGNAL_NEW_DATA_AVAILABLE(chunk, s, step);
|
|
}
|
|
}
|
|
|
|
// steps j with 0 < j < k - 1:
|
|
// copy a block that was pushed to this GPU to the next GPU
|
|
for (step = 1; step < args.NumGPUs - 1; ++step) {
|
|
block = GetBlock(args.ThisId, step, args.UserFromRing, args.NumGPUs);
|
|
outputOffset = chunkOffset + block * args.N;
|
|
if (!PUSHRECV) {
|
|
bufferOffset = block * NUM_SUBCHUNKS * args.BufferSliceStride +
|
|
block * args.BufferMisalignedN;
|
|
}
|
|
|
|
if (tid < THREADS) {
|
|
for(int s=0; s<NUM_SUBCHUNKS; ++s) {
|
|
getSliceSizeAndChunkSize(&sliceSize, s, numSlices, numBigSlices,
|
|
numSmallSlices, bigSliceN, smallSliceN, lastSliceN);
|
|
WAIT_FOR_NEW_DATA(chunk, s, step);
|
|
|
|
if (PUSHRECV) {
|
|
Copy<UNROLL, THREADS>(
|
|
(volatile T *)nextOutput + outputOffset,
|
|
args.ThisOutput + outputOffset,
|
|
sliceSize);
|
|
} else {
|
|
DoubleCopy<UNROLL, THREADS>(
|
|
args.NextBuffer + bufferOffset,
|
|
args.ThisOutput + outputOffset,
|
|
args.ThisBuffer + bufferOffset,
|
|
sliceSize);
|
|
}
|
|
__syncthreads();
|
|
|
|
outputOffset += sliceSize;
|
|
if (!PUSHRECV)
|
|
bufferOffset += sliceSize;
|
|
}
|
|
} else {
|
|
for(int s=0; s<NUM_SUBCHUNKS; ++s) {
|
|
__syncthreads();
|
|
SIGNAL_NEW_DATA_AVAILABLE(chunk, s, step);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!PUSHRECV) {
|
|
step = args.NumGPUs - 1;
|
|
block = GetBlock(args.ThisId, step, args.UserFromRing, args.NumGPUs);
|
|
outputOffset = chunkOffset + block * args.N;
|
|
bufferOffset = block * NUM_SUBCHUNKS * args.BufferSliceStride +
|
|
block * args.BufferMisalignedN;
|
|
|
|
// Make final copy from buffer to dest.
|
|
if (tid < THREADS) {
|
|
for(int s=0; s<NUM_SUBCHUNKS; ++s) {
|
|
getSliceSizeAndChunkSize(&sliceSize, s, numSlices, numBigSlices,
|
|
numSmallSlices, bigSliceN, smallSliceN, lastSliceN);
|
|
WAIT_FOR_NEW_DATA(chunk, s, step);
|
|
|
|
Copy<UNROLL, THREADS>(
|
|
args.ThisOutput + outputOffset,
|
|
args.ThisBuffer + bufferOffset,
|
|
sliceSize);
|
|
|
|
__syncthreads();
|
|
|
|
outputOffset += sliceSize;
|
|
bufferOffset += sliceSize;
|
|
}
|
|
} else {
|
|
for(int s=0; s<NUM_SUBCHUNKS; ++s) {
|
|
__syncthreads();
|
|
SIGNAL_CHUNK_DONE(chunk, s);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// wait for the last data to be pushed to us
|
|
if (tid < THREADS) {
|
|
if (PUSHRECV)
|
|
WAIT_FOR_NEW_DATA(args.NumChunks, NUM_SUBCHUNKS-1, 0);
|
|
else
|
|
WAIT_FOR_PREV_CHUNK(args.NumChunks, NUM_SUBCHUNKS-1);
|
|
|
|
if (tid == 0) {
|
|
args.ThisNewDataAvailableFlag[0] = 0;
|
|
args.ThisChunkDoneFlag[0] = 0;
|
|
*args.ThisPtrToNextOutput = nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
ncclResult_t ncclAllGatherWithType(const void* sendbuff, void* recvbuff,
|
|
int count, ncclComm* comm, int numUnroll, cudaStream_t stream) {
|
|
if (count == 0)
|
|
return ncclSuccess;
|
|
|
|
int index = comm->ncclId;
|
|
|
|
int blockSizeInBytes = count * sizeof(T);
|
|
int misalignedBytes = blockSizeInBytes % alignof(uint64_t);
|
|
|
|
assert((int)((misalignedBytes / sizeof(T)) * sizeof(T)) == misalignedBytes);
|
|
|
|
int misalignedN = misalignedBytes / sizeof(T);
|
|
assert(misalignedN < (int)(sizeof(uint64_t) / sizeof(T)));
|
|
|
|
int paddingN = (misalignedN > 0) ? sizeof(uint64_t) / sizeof(T) : 0;
|
|
|
|
// There is one slice per GPU, so a slice can be at most bufferN / numGPUs,
|
|
// where bufferN is the number of elements of type T that fit into the buffer.
|
|
int bufferN = comm->buffSize / sizeof(T);
|
|
// we only need buffer for k slices and k paddings
|
|
int bufferNPerSlice = (bufferN - comm->nDev * NUM_SUBCHUNKS * paddingN)
|
|
/ (comm->nDev * NUM_SUBCHUNKS);
|
|
// For efficiency, we want the slice size to be a multiple of UNROLL_SIZE
|
|
int maxSliceSize = (bufferNPerSlice / UNROLL_SIZE) * UNROLL_SIZE;
|
|
int nextId = (index + 1) % comm->nDev;
|
|
int prevId = (index + comm->nDev - 1) % comm->nDev;
|
|
|
|
AllGatherKernelArgs<T> args;
|
|
|
|
args.ThisId = index;
|
|
args.NumGPUs = comm->nDev;
|
|
args.N = count;
|
|
|
|
/* Block j is coming from sendbuff[j], which lives on device with logical
|
|
* index comm->ringFromUser[j]. But the block ordering does not necessarily
|
|
* follow the ring ordering. Hence the order in which a particular GPU
|
|
* processes the different blocks (the correspondence between the step in
|
|
* the reduction algorithm and the block on which a GPU operates in that
|
|
* particular step) is not the same as the ring order.
|
|
*
|
|
* Say we have 4 GPUs and comm->userFromRing = { 1, 2, 0, 3 }. Then there are 3
|
|
* step in the all-gather algorithm and block 0 comes from device 2, block 1
|
|
* from 0, block 2 from device 1, and block 3 comes from device 3. In the
|
|
* first step of the algorithm, each GPU must copy its own block from its
|
|
* sendbuff to the appropriate location in its recvbuff. The blocks that a
|
|
* GPU has to process in the next steps is determined by the previous step
|
|
* because each GPU only hands off data to the next GPU in the ring.
|
|
*
|
|
* In the above example, we get the following table of which block is
|
|
* processed by each GPU in a given step. The columns correspond to the
|
|
* different GPUs while the rows are the steps in the algorithm.
|
|
*
|
|
* GPU 0 1 2 3
|
|
* step
|
|
* 0 1 2 0 3
|
|
* 1 3 1 2 0
|
|
* 2 0 3 1 2
|
|
*
|
|
* We note the the rows in the above table are just comm->userFromRing in the
|
|
* first step and the list is cyclicly permuted to the right for each next
|
|
* step. The columns, which are what the individual GPUs need to know, are
|
|
* comm->userFromRing traversed backwards and starting at index k for GPU k.
|
|
* These columns are what we put into args.BlockVsStep to tell the GPU which
|
|
* block it needs to be processing at a particular step. */
|
|
args.UserFromRing = comm->devUserFromRing;
|
|
|
|
args.SliceSize = numUnroll * UNROLL_SIZE * sizeof(PackType) / sizeof(T);
|
|
args.SliceSize = std::min(maxSliceSize, args.SliceSize);
|
|
args.ChunkSize = NUM_SUBCHUNKS * args.SliceSize;
|
|
|
|
// don't reduce this if we cut the slice size in half below, because if that
|
|
// happens, the last chunk will be larger than the other chunks, and we will
|
|
// need the extra buffer space
|
|
args.BufferSliceStride = args.SliceSize + paddingN;
|
|
|
|
args.BufferMisalignedN = misalignedN;
|
|
|
|
// avoid a case where we have one or more big chunks and one tiny one
|
|
int remainder = args.N % args.ChunkSize;
|
|
if ((args.N > args.ChunkSize) && (remainder > 0) &&
|
|
(args.N < 5 * args.ChunkSize) && (2 * remainder < args.ChunkSize)) {
|
|
args.SliceSize /= 2;
|
|
args.ChunkSize = NUM_SUBCHUNKS * args.SliceSize;
|
|
|
|
// round down so we end up with a big last chunk
|
|
args.NumChunks = args.N / args.ChunkSize;
|
|
} else {
|
|
// round up
|
|
args.NumChunks = (args.N + args.ChunkSize - 1) / args.ChunkSize;
|
|
}
|
|
|
|
args.ThisPtrToNextOutput = (T**)&(comm->local[nextId]->recvPtrs[0]);
|
|
args.PrevPtrToThisOutput = (T**)&(comm->remote[prevId]->recvPtrs[0]);
|
|
|
|
args.ThisInput = (const T*)sendbuff;
|
|
args.ThisOutput = (volatile T*)recvbuff;
|
|
args.ThisBuffer = (volatile T*)comm->local[prevId]->buff;
|
|
args.NextBuffer = (volatile T*)comm->remote[nextId]->buff;
|
|
|
|
args.ThisNewDataAvailableFlag = comm->local[prevId]->flags;
|
|
args.NextNewDataAvailableFlag = comm->remote[nextId]->flags;
|
|
args.ThisChunkDoneFlag = comm->local[nextId]->flags + 1;
|
|
args.PrevChunkDoneFlag = comm->remote[prevId]->flags + 1;
|
|
|
|
if( comm->useRemoteRecv ) {
|
|
AllGatherKernel<NUM_THREADS, UNROLL_COUNT, true, T>
|
|
<<<1, NUM_THREADS + 1, 0, stream>>>(args);
|
|
} else {
|
|
AllGatherKernel<NUM_THREADS, UNROLL_COUNT, false, T>
|
|
<<<1, NUM_THREADS + 1, 0, stream>>>(args);
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
class AllGatherFunctor {
|
|
public:
|
|
ncclResult_t operator()(const void* sendbuff, void* recvbuff,
|
|
int count, ncclDataType_t datatype, ncclRedOp_t /*dummy operation*/,
|
|
int /*dummy root*/, ncclComm* comm, cudaStream_t stream) {
|
|
int numUnroll = 16; // this is optimal on dt07 with 4 GPUs
|
|
|
|
switch (datatype) {
|
|
case ncclChar:
|
|
return ncclAllGatherWithType<char>(sendbuff, recvbuff, count, comm,
|
|
numUnroll, stream);
|
|
case ncclInt:
|
|
return ncclAllGatherWithType<int>(sendbuff, recvbuff, count, comm,
|
|
numUnroll, stream);
|
|
#if CUDART_VERSION >= 7050
|
|
case ncclHalf:
|
|
return ncclAllGatherWithType<half>(sendbuff, recvbuff, count, comm,
|
|
numUnroll, stream);
|
|
#endif
|
|
case ncclFloat:
|
|
return ncclAllGatherWithType<float>(sendbuff, recvbuff, count, comm,
|
|
numUnroll, stream);
|
|
case ncclDouble:
|
|
return ncclAllGatherWithType<double>(sendbuff, recvbuff, count, comm,
|
|
numUnroll, stream);
|
|
case ncclInt64:
|
|
return ncclAllGatherWithType<long long>(sendbuff, recvbuff, count, comm,
|
|
numUnroll, stream);
|
|
case ncclUint64:
|
|
return ncclAllGatherWithType<unsigned long long>(sendbuff, recvbuff, count, comm,
|
|
numUnroll, stream);
|
|
}
|
|
return ncclInvalidType;
|
|
}
|
|
};
|
|
|
|
extern "C" DSOGLOBAL
|
|
ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
|
|
void* recvbuff, ncclComm_t comm, cudaStream_t stream) {
|
|
return enqueue(AllGatherFunctor(), sendbuff, recvbuff, count, datatype,
|
|
ncclSum, 0, comm, stream);
|
|
}
|