Add support for CUDA graphs.
Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439.
Fix bootstrap issue caused by connection reordering.
Fix CPU locking block.
Improve CollNet algorithm.
Improve performance on DGX A100 for communicators with only one GPU per node.
This commit is contained in:
Sylvain Jeaugey
2021-04-12 16:00:11 -07:00
förälder 911d61f214
incheckning a46ea10583
43 ändrade filer med 2687 tillägg och 1244 borttagningar
+7 -2
Visa fil
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -38,8 +38,13 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
/*
 * Allocate device memory for `nelem` elements of type T and zero-fill it.
 *
 *   ptr   - out: receives the device pointer produced by cudaMalloc.
 *   nelem - number of T elements; nelem*sizeof(T) bytes are allocated and cleared.
 *
 * Returns ncclSuccess after the zero-fill has been waited on with
 * cudaStreamSynchronize, so the buffer is cleared by the time this returns.
 * Every CUDA call goes through CUDACHECK.
 *
 * NOTE(review): CUDACHECK is defined outside this view. If it returns early on
 * failure, a failing call after stream creation would skip cudaStreamDestroy
 * (and leave *ptr allocated) - confirm and add cleanup if so.
 */
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
  // Need async stream for P2P pre-connect + CUDA Graph
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
  // Zero the buffer only on the private non-blocking stream. A plain cudaMemset
  // here would run on the default stream, which is what the dedicated stream is
  // meant to avoid, and would also clear the buffer a second time for nothing.
  CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
  // Wait for the fill to finish before handing the pointer back, then drop the
  // temporary stream.
  CUDACHECK(cudaStreamSynchronize(stream));
  CUDACHECK(cudaStreamDestroy(stream));
  return ncclSuccess;
}