68b542363f
Add scalable init API * Add new ncclCommInitRankScalable to allow for passing multiple unique IDs to the init function. * Spreads the load onto multiple bootstrap roots, allowing for constant bootstrap time. * Requires multiple ranks to create a unique ID, and the CPU-side ID exchange code to call allgather[v] instead of broadcast. Accelerate init bootstrap operations * Reduce the number of calls to allgather. * Allow roots to reply early to ranks when information is already available. * Add an option to use ncclNet instead of sockets to perform bootstrap allgather operations. Add PAT algorithms for Allgather and ReduceScatter * Parallel Aggregated Trees, variation of Bruck algorithm. * Logarithmic number of network steps for small sizes at scale. * Only supports one rank per node at the moment. Add support for registered buffers for intra-node communication. * Allow registered user buffers to be accessed directly intra-node * Avoids extra copies in algorithms which permit it, saving memory bandwidth and helping with compute overlap. Add profiler plugin API * New plugin API for profiling * Supports various levels of profiling, with a hierarchy. Asynchronous graph allocation * Make calls to cudaMalloc and cudaMemcpy during graph allocation asynchronous. * Significantly speeds up graph capture. Use fatal IB asynchronous events to stop network operation * Avoids many other error messages * Only fatal errors are affected; potentially transient errors (e.g. port down) do not cause an immediate stop. Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node * P2P would cause a significant performance degradation when using many GPUs, and therefore many interleaved data flows. * Disable P2P through the CPU when we have 3+ GPUs per node; keep it enabled when we only have 2 GPUs. Improve the init logs to report the real NCCL function. * Make the log report ncclCommInitRank or ncclCommSplit, rather than the generic ncclCommInitRankFunc. 
Add a parameter to set the location of the user configuration file. * Add NCCL_CONF_FILE environment variable to set where the user's configuration file resides. Increase default IB timeout * Increase IB timeout value from 18 to 20. * Should help avoid fatal errors on large RoCE systems. Add new check for nvidia peermem * On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer present; check for /sys/module/nvidia_peermem/version instead. Fix old performance regression when mixing small and large operations. * Improves distribution of work on channels. Fix crash when NUMA IDs are equal to -1. * Can happen when a NIC is a virtual NIC, or when linux doesn't know which NUMA node a device is attached to * Issue NVIDIA/nccl-tests#233 Fix tree graph search when NCCL_CROSS_NIC is set to 1. * Would force NCCL to use the balanced_tree pattern, thereby disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch. * Would also try to use alternate rings even though it was not needed. Compiler tweaks and fixes * PR #1177 * PR #1228 Fix stack smash * PR #1325 Fixes for multi-node NVLink + IB operation Coverity fixes and comments.
418 líneas
16 KiB
C++
418 líneas
16 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_ALLOC_H_
|
|
#define NCCL_ALLOC_H_
|
|
|
|
#include "nccl.h"
|
|
#include "checks.h"
|
|
#include "bitops.h"
|
|
#include "utils.h"
|
|
#include "p2p.h"
|
|
#include <sys/mman.h>
|
|
#include <unistd.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#if CUDART_VERSION >= 11030
|
|
#include <cuda.h>
|
|
#include "cudawrap.h"
|
|
#endif
|
|
|
|
uint64_t clockNano(); // from utils.h with which we have a circular dependency
|
|
|
|
// Size in bytes of one element of type T, usable in constant expressions.
template <typename T>
constexpr size_t ncclSizeOfT() {
  return sizeof(T);
}

// sizeof(void) is ill-formed, so void is assigned a size of 1: an element
// count multiplied by ncclSizeOfT<void>() is therefore a byte count.
template <>
constexpr size_t ncclSizeOfT<void>() {
  return 1;
}
|
|
|
|
#if CUDART_VERSION >= 12020
|
|
|
|
// Allocate pinned host memory through the CUDA virtual memory management
// (cuMem*) driver API and map it for both the current GPU and the CPU.
//
//   ptr     [out] receives the mapped address.
//   handlep [out, optional] receives the generic allocation handle when non-NULL.
//   size    requested size in bytes; rounded up to the minimum allocation
//           granularity before the memory is created.
//
// The memory is placed on the host NUMA node CUDA reports for the current
// device (node 0 when that attribute is negative).
// Returns ncclSuccess when every step succeeds.
// NOTE(review): presumably CUDACHECK/CUCHECK return early on failure; if so, a
// failure after cuMemCreate leaves the handle / reserved range unreleased — confirm.
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
  ncclResult_t result = ncclSuccess;
  size_t granularity = 0;
  CUdevice currentDev;
  CUmemAllocationProp prop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
  int cudaDev;
  int cpuNumaNodeId = -1;
  CUmemAllocationHandleType type = ncclCuMemHandleType;

  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
  CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
  // A negative NUMA id is replaced by node 0 so the location below is valid.
  if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;
  prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.requestedHandleTypes = type; // So it can be exported
  prop.location.id = cpuNumaNodeId;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  ALIGN_SIZE(size, granularity);
  /* Allocate the physical memory on the selected host NUMA node */
  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
  /* Reserve a virtual address range */
  CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, granularity, 0, 0));
  /* Map the virtual address range to the physical allocation */
  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
  /* Now allow RW access to the newly mapped memory for local GPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = cudaDev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));

  /* Now allow RW access to the newly mapped memory from the CPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  accessDesc.location.id = cpuNumaNodeId;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));

  if (handlep) *handlep = handle;
  INFO(NCCL_ALLOC, "CUMEM Host Alloc Size %zi pointer %p handle %llx numa %d dev %d granularity %ld", size, *ptr, handle, cpuNumaNodeId, cudaDev, granularity);
  return result;
}
|
|
|
|
// Tear down a host buffer that was created through the cuMem host allocation
// path: unmap it, release its physical allocation and free its address range.
// A NULL pointer is accepted and reported as success without doing anything.
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
  if (!ptr) return ncclSuccess;

  ncclResult_t status = ncclSuccess;
  size_t mappedBytes = 0;
  CUmemGenericAllocationHandle allocHandle;

  // Look up the handle backing ptr; the retain is balanced by an immediate release.
  CUCHECK(cuMemRetainAllocationHandle(&allocHandle, ptr));
  CUCHECK(cuMemRelease(allocHandle));
  // Size of the address range containing ptr, needed for unmap / address free.
  CUCHECK(cuMemGetAddressRange(NULL, &mappedBytes, (CUdeviceptr)ptr));
  TRACE(NCCL_ALLOC, "CUMEM Host Free Size %zi pointer %p handle 0x%llx", mappedBytes, ptr, allocHandle);
  CUCHECK(cuMemUnmap((CUdeviceptr)ptr, mappedBytes));
  // Second release; presumably drops the reference taken at allocation time — confirm.
  CUCHECK(cuMemRelease(allocHandle));
  CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, mappedBytes));
  return status;
}
|
|
|
|
#else /* CUDART_VERSION >= 12020 */
|
|
|
|
// Placeholder built when CUDART_VERSION < 12020: cuMem host allocation is not
// available, so the call emits a WARN and fails with ncclInternalError.
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, void* handlep, size_t size) {
  (void)ptr;
  (void)handlep;
  (void)size;
  WARN("CUMEM Host is not supported prior to CUDA 12.2");
  return ncclInternalError;
}
|
|
|
|
// Placeholder built when CUDART_VERSION < 12020: there is nothing that could
// have been allocated, so the call emits a WARN and fails with ncclInternalError.
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
  (void)ptr;
  WARN("CUMEM Host is not supported prior to CUDA 12.2");
  return ncclInternalError;
}
|
|
|
|
#endif /* CUDART_VERSION >= 12020 */
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
|
ncclResult_t result = ncclSuccess;
|
|
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
|
*ptr = nullptr;
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (nelem > 0) {
|
|
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped), result, finish);
|
|
memset(*ptr, 0, nelem*ncclSizeOfT<T>());
|
|
}
|
|
finish:
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT<T>());
|
|
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
|
return result;
|
|
}
|
|
|
|
// Free host memory with cudaFreeHost. Returns ncclSuccess when the CUDA call
// passes the CUDACHECK.
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
  CUDACHECK(cudaFreeHost(ptr));

  return ncclSuccess;
}
|
|
|
|
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
|
if (nelem > 0) {
|
|
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
|
|
if (p == NULL) {
|
|
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
|
return ncclSystemError;
|
|
}
|
|
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
|
|
memset(p, 0, nelem*ncclSizeOfT<T>());
|
|
*ptr = p;
|
|
} else {
|
|
*ptr = NULL;
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
|
T* oldp = *ptr;
|
|
if (nelem < oldNelem || (oldp == NULL && oldNelem > 0)) return ncclInternalError;
|
|
if (nelem == oldNelem) return ncclSuccess;
|
|
|
|
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
|
|
if (p == NULL) {
|
|
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
|
return ncclSystemError;
|
|
}
|
|
if (oldp && oldNelem) memcpy(p, oldp, oldNelem * ncclSizeOfT<T>());
|
|
if (oldp) free(oldp);
|
|
memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
|
|
*ptr = (T*)p;
|
|
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
|
|
return ncclSuccess;
|
|
}
|
|
|
|
#if CUDART_VERSION >= 11030
|
|
|
|
#include <cuda.h>
|
|
#include "cudawrap.h"
|
|
|
|
// ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer
|
|
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
|
|
ncclResult_t result = ncclSuccess;
|
|
size_t granularity = 0;
|
|
CUmemAllocationProp prop = {};
|
|
CUmemAccessDesc accessDesc = {};
|
|
int cudaDev;
|
|
CUDACHECK(cudaGetDevice(&cudaDev));
|
|
CUCHECK(cuMemGetAllocationPropertiesFromHandle(&prop, *handleIn));
|
|
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
|
ALIGN_SIZE(size, granularity);
|
|
/* Reserve a virtual address range */
|
|
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
|
|
/* Map the virtual address range to the physical allocation */
|
|
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, *handleIn, 0));
|
|
/* Now allow RW access to the newly mapped memory */
|
|
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
accessDesc.location.id = cudaDev;
|
|
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
|
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
|
|
TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn);
|
|
return result;
|
|
}
|
|
|
|
// Undo a mapping: unmap the address range containing ptr and free the
// reserved virtual addresses. No cuMemRelease is issued here, so the
// underlying allocation handle is left alone.
// A NULL pointer is accepted and reported as success.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
  if (!ptr) return ncclSuccess;

  ncclResult_t status = ncclSuccess;
  size_t rangeBytes = 0;
  const CUdeviceptr addr = (CUdeviceptr)ptr;

  CUCHECK(cuMemGetAddressRange(NULL, &rangeBytes, addr));
  CUCHECK(cuMemUnmap(addr, rangeBytes));
  CUCHECK(cuMemAddressFree(addr, rangeBytes));
  return status;
}
|
|
|
|
// Allocate device memory on the current GPU through the CUDA virtual memory
// management (cuMem*) driver API.
//
//   ptr     [out] receives the mapped device address.
//   handlep [out, optional] receives the generic allocation handle when non-NULL.
//   size    requested size in bytes; rounded up to the minimum allocation
//           granularity before the memory is created.
//
// The allocation is pinned, requests the ncclCuMemHandleType handle type, and
// is flagged GPUDirect-RDMA capable when the device reports support for RDMA
// with CUDA VMM. The mapping is made read/write accessible from the current device.
// NOTE(review): presumably CUDACHECK/CUCHECK return early on failure; if so, a
// failure after cuMemCreate leaves the handle / reserved range unreleased — confirm.
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
  ncclResult_t result = ncclSuccess;
  size_t granularity = 0;
  CUdevice currentDev;
  CUmemAllocationProp prop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
  CUmemAllocationHandleType type = ncclCuMemHandleType;
  int cudaDev;
  int flag = 0;
  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.requestedHandleTypes = type;
  prop.location.id = currentDev;
  // Query device to see if RDMA support is available
  CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
  if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  ALIGN_SIZE(size, granularity);
  /* Allocate the physical memory on the device */
  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
  /* Reserve a virtual address range */
  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
  /* Map the virtual address range to the physical allocation */
  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
  /* Now allow RW access to the newly mapped memory */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = currentDev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
  if (handlep) *handlep = handle;
  TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle);
  return result;
}
|
|
|
|
// Free device memory that was obtained through the cuMem allocation path:
// unmap it, release its physical allocation and free its address range.
// A NULL pointer is accepted and reported as success without doing anything.
static inline ncclResult_t ncclCuMemFree(void *ptr) {
  if (!ptr) return ncclSuccess;

  ncclResult_t status = ncclSuccess;
  size_t mappedBytes = 0;
  CUmemGenericAllocationHandle allocHandle;

  // Look up the handle backing ptr; the retain is balanced by an immediate release.
  CUCHECK(cuMemRetainAllocationHandle(&allocHandle, ptr));
  CUCHECK(cuMemRelease(allocHandle));
  // Size of the address range containing ptr, needed for unmap / address free.
  CUCHECK(cuMemGetAddressRange(NULL, &mappedBytes, (CUdeviceptr)ptr));
  TRACE(NCCL_ALLOC, "CuMem Free Size %zu pointer %p handle 0x%llx", mappedBytes, ptr, allocHandle);
  CUCHECK(cuMemUnmap((CUdeviceptr)ptr, mappedBytes));
  // Second release; presumably drops the reference taken at allocation time — confirm.
  CUCHECK(cuMemRelease(allocHandle));
  CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, mappedBytes));
  return status;
}
|
|
|
|
#else
|
|
|
|
extern int ncclCuMemEnable();
|
|
|
|
// Placeholder built when CUDART_VERSION < 11030: cuMem allocation is not
// available, so the call emits a WARN and fails with ncclInternalError.
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
  (void)ptr;
  (void)handlep;
  (void)size;
  WARN("CUMEM not supported prior to CUDA 11.3");
  return ncclInternalError;
}
|
|
// Placeholder built when CUDART_VERSION < 11030: emits a WARN and fails with
// ncclInternalError.
static inline ncclResult_t ncclCuMemFree(void *ptr) {
  (void)ptr;
  WARN("CUMEM not supported prior to CUDA 11.3");
  return ncclInternalError;
}
|
|
|
|
// Placeholder built when CUDART_VERSION < 11030: mapping a cuMem handle is
// not available, so the call emits a WARN and fails with ncclInternalError.
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
  (void)ptr;
  (void)handleIn;
  (void)size;
  WARN("CUMEM not supported prior to CUDA 11.3");
  return ncclInternalError;
}
|
|
|
|
// Placeholder built when CUDART_VERSION < 11030: emits a WARN and fails with
// ncclInternalError.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
  (void)ptr;
  WARN("CUMEM not supported prior to CUDA 11.3");
  return ncclInternalError;
}
|
|
#endif
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
|
ncclResult_t result = ncclSuccess;
|
|
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
|
*ptr = nullptr;
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (nelem > 0) {
|
|
if (ncclCuMemEnable()) {
|
|
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
|
|
} else {
|
|
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
|
|
}
|
|
}
|
|
finish:
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
|
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
|
return result;
|
|
}
|
|
#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
|
ncclResult_t result = ncclSuccess;
|
|
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
|
*ptr = nullptr;
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (nelem > 0) {
|
|
// Need a side stream so as not to interfere with graph capture.
|
|
cudaStream_t stream;
|
|
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
|
if (ncclCuMemEnable()) {
|
|
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
|
|
} else {
|
|
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
|
|
}
|
|
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
|
|
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
|
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
|
}
|
|
finish:
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT<T>());
|
|
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
|
return result;
|
|
}
|
|
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) {
|
|
ncclResult_t result = ncclSuccess;
|
|
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
|
*ptr = nullptr;
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (nelem > 0) {
|
|
if (ncclCuMemEnable()) {
|
|
NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
|
|
} else {
|
|
CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
|
|
}
|
|
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
|
|
}
|
|
finish:
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT<T>());
|
|
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
|
return result;
|
|
}
|
|
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
|
ncclResult_t result = ncclSuccess;
|
|
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
// Need a side stream so as not to interfere with graph capture.
|
|
cudaStream_t stream;
|
|
CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
|
|
NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
|
|
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
|
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
|
finish:
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
return result;
|
|
}
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
|
|
ncclResult_t result = ncclSuccess;
|
|
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*ncclSizeOfT<T>(), cudaMemcpyDefault, stream), result, finish);
|
|
finish:
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
return result;
|
|
}
|
|
|
|
template <typename T>
|
|
ncclResult_t ncclCudaFree(T* ptr) {
|
|
ncclResult_t result = ncclSuccess;
|
|
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
|
TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
if (ncclCuMemEnable()) {
|
|
NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
|
|
} else {
|
|
CUDACHECKGOTO(cudaFree(ptr), result, finish);
|
|
}
|
|
finish:
|
|
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
|
return result;
|
|
}
|
|
|
|
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
|
// allocated on separate pages as those pages will be marked DONTFORK
|
|
// and if they are shared, that could cause a crash in a child process
|
|
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
|
if (size > 0) {
|
|
long page_size = sysconf(_SC_PAGESIZE);
|
|
if (page_size < 0) return ncclSystemError;
|
|
void* p;
|
|
int size_aligned = ROUNDUP(size, page_size);
|
|
int ret = posix_memalign(&p, page_size, size_aligned);
|
|
if (ret != 0) return ncclSystemError;
|
|
memset(p, 0, size);
|
|
*ptr = p;
|
|
} else {
|
|
*ptr = NULL;
|
|
}
|
|
INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
|
|
return ncclSuccess;
|
|
}
|
|
#define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
|
|
|
#endif
|