7e51592129
Add support for bfloat16. Add ncclAvg reduction operation. Improve performance for aggregated operations. Improve performance for tree. Improve network error reporting. Add NCCL_NET parameter to force a specific network. Add NCCL_IB_QPS_PER_CONNECTION parameter to split IB traffic onto multiple queue pairs. Fix topology detection error in WSL2. Fix proxy memory elements affinity (improve alltoall performance). Fix graph search on cubemesh topologies. Fix hang in cubemesh during NVB connections.
80 regels
3.0 KiB
C++
80 regels
3.0 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_ALLOC_H_
|
|
#define NCCL_ALLOC_H_
|
|
|
|
#include "nccl.h"
|
|
#include "checks.h"
|
|
#include "align.h"
|
|
#include <sys/mman.h>
|
|
|
|
template <typename T>
|
|
static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
|
CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
|
|
memset(*ptr, 0, nelem*sizeof(T));
|
|
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
|
return ncclSuccess;
|
|
}
|
|
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
// Releases host memory through cudaFreeHost().
// Returns ncclSuccess, or whatever CUDACHECK returns if the CUDA call fails.
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
  CUDACHECK(cudaFreeHost(ptr));
  return ncclSuccess;
}
|
|
|
|
template <typename T>
|
|
static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
|
void* p = malloc(nelem*sizeof(T));
|
|
if (p == NULL) {
|
|
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
|
return ncclSystemError;
|
|
}
|
|
memset(p, 0, nelem*sizeof(T));
|
|
*ptr = (T*)p;
|
|
INFO(NCCL_ALLOC, "%s:%d Mem Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
|
return ncclSuccess;
|
|
}
|
|
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
template <typename T>
|
|
static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
|
// Need async stream for P2P pre-connect + CUDA Graph
|
|
cudaStream_t stream;
|
|
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
|
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
|
|
CUDACHECK(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
|
CUDACHECK(cudaStreamSynchronize(stream));
|
|
CUDACHECK(cudaStreamDestroy(stream));
|
|
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
|
return ncclSuccess;
|
|
}
|
|
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
template <typename T>
|
|
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
|
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
|
// allocated on separate pages as those pages will be marked DONTFORK
|
|
// and if they are shared, that could cause a crash in a child process
|
|
static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
|
size_t page_size = sysconf(_SC_PAGESIZE);
|
|
void* p;
|
|
int size_aligned = ROUNDUP(size, page_size);
|
|
int ret = posix_memalign(&p, page_size, size_aligned);
|
|
if (ret != 0) return ncclSystemError;
|
|
memset(p, 0, size);
|
|
*ptr = p;
|
|
INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
|
|
return ncclSuccess;
|
|
}
|
|
#define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
#endif
|