d97a32fac8
Add support for IB SHARP to NVLS (NVLink SHARP algorithm). Add NVLS+Tree algorithm. Add support for memory management using cuMem* functions. Use all NICs for Send/Receive operations on systems with more than one NIC per GPU (#804). Add ncclCommSplit primitive, with resource sharing option in config. Fix alltoallv hang (#788). Increase number of channels on H100 when we're not limited by NVLink. Improve error reporting in case of IB failure, printing local and remote ID (#779). Add build option to allow compilation against RDMA includes instead of dynamically loading IB verbs symbols (#802). Fix context creation for progress thread (#803). NET/IB: add option to use multiple QPs in round-robin mode. Fix tree performance issue when NVB is disabled on HCM topologies.
248 baris
7.9 KiB
C++
248 baris
7.9 KiB
C++
/*************************************************************************
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#include "nccl.h"
|
|
#include "debug.h"
|
|
#include "param.h"
|
|
#include "cudawrap.h"
|
|
|
|
#include <dlfcn.h>
|
|
|
|
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
|
|
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
|
|
|
|
static int ncclCuMemSupported = 0;
|
|
|
|
// Determine whether CUMEM & VMM RDMA is supported on this platform.
//
// Returns 1 only when every probe below succeeds, 0 otherwise:
//   - built against CUDART_VERSION >= 11030,
//   - the driver version reported by cudaDriverGetVersion() is >= 12000,
//   - the cuMemCreate driver entry point has been resolved (non-NULL),
//   - the current device reports both
//     CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED and
//     CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED.
// NOTE(review): CUDACHECKGOTO/CUCHECKGOTO are defined elsewhere; presumably they
// store a failure code in `ret` and jump to `error`, which makes this return 0.
int ncclIsCuMemSupported() {
#if CUDART_VERSION < 11030
  return 0;
#else
  CUdevice currentDev;
  int cudaDev;
  int cudaDriverVersion;
  int flag = 0;
  ncclResult_t ret = ncclSuccess;
  CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error);
  if (cudaDriverVersion < 12000) return 0;  // Need CUDA_VISIBLE_DEVICES support
  CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error);
  // The cuMem entry points are optional at load time; without cuMemCreate there is nothing to use.
  if (CUPFN(cuMemCreate) == NULL) return 0;
  // Translate the runtime device ordinal into a driver device handle.
  CUCHECKGOTO(cuDeviceGet(&currentDev, cudaDev), ret, error);
  // Query device to see if CUMEM VMM support is available
  CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error);
  if (!flag) return 0;
  // Query device to see if CUMEM RDMA support is available
  CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error);
  if (!flag) return 0;
error:
  // Reached both on full success (ret still ncclSuccess) and via the error jumps.
  return (ret == ncclSuccess);
#endif
}
|
|
|
|
int ncclCuMemEnable() {
|
|
return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable());
|
|
}
|
|
|
|
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
|
|
|
|
#if CUDART_VERSION >= 11030
|
|
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
|
DECLARE_CUDA_PFN(cuDeviceGet, 2000);
|
|
DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000);
|
|
DECLARE_CUDA_PFN(cuGetErrorString, 6000);
|
|
DECLARE_CUDA_PFN(cuGetErrorName, 6000);
|
|
/* enqueue.cc */
|
|
DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
|
|
/* proxy.cc */
|
|
DECLARE_CUDA_PFN(cuCtxCreate, 3020);
|
|
DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
|
|
DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
|
|
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
|
|
DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
|
|
/* cuMem API support */
|
|
DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
|
|
DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
|
|
DECLARE_CUDA_PFN(cuMemCreate, 10020);
|
|
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
|
|
DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
|
|
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
|
|
DECLARE_CUDA_PFN(cuMemMap, 10020);
|
|
DECLARE_CUDA_PFN(cuMemRelease, 10020);
|
|
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
|
|
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
|
|
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
|
|
#if CUDA_VERSION >= 11070
|
|
/* transport/collNet.cc/net.cc*/
|
|
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
|
#endif
|
|
#if CUDA_VERSION >= 12010
|
|
/* NVSwitch Multicast support */
|
|
DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
|
|
DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
|
|
DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
|
|
DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
|
|
DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
|
|
DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
|
|
#endif
|
|
#endif
|
|
|
|
/* CUDA Driver functions loaded with dlsym() */
|
|
DECLARE_CUDA_PFN(cuInit, 2000);
|
|
DECLARE_CUDA_PFN(cuDriverGetVersion, 2020);
|
|
DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
|
|
|
|
// Oldest driver version accepted by initOnceFunc(); older drivers make initialisation fail.
#define CUDA_DRIVER_MIN_VERSION 11030

// Handle returned by dlopen() for the CUDA driver library.
static void *cudaLib;

// Starts at -1 and is not written in this file.
// NOTE(review): presumably -1 means "not queried yet" - confirm against its users.
int ncclCudaDriverVersionCache = -1;

// Set by initOnceFunc() from the CUDA_LAUNCH_BLOCKING environment variable.
bool ncclCudaLaunchBlocking = false;
|
|
|
|
#if CUDART_VERSION >= 11030
/*
  Load the CUDA symbols

  Every pfn_<symbol> pointer listed below is resolved through
  pfn_cuGetProcAddress(), in table order. A failed lookup of an entry whose
  `ignore` field is 0 logs a warning and aborts with ncclSystemError; failures
  of the remaining entries are tolerated. Returns ncclSuccess once the whole
  table has been processed.
*/
static ncclResult_t cudaPfnFuncLoader(void) {
  // One row per driver entry point: its name, the pointer to fill in, the
  // version to request, and whether a failed lookup may be ignored.
  struct PfnEntry {
    const char* name;
    void** slot;
    int version;
    int ignore;
  };

#define PFN_ENTRY(symbol, version, ignore) { #symbol, (void **) (&pfn_##symbol), version, ignore }

  const PfnEntry entries[] = {
    PFN_ENTRY(cuGetErrorString, 6000, 0),
    PFN_ENTRY(cuGetErrorName, 6000, 0),
    PFN_ENTRY(cuDeviceGet, 2000, 0),
    PFN_ENTRY(cuDeviceGetAttribute, 2000, 0),
    PFN_ENTRY(cuMemGetAddressRange, 3020, 1),
    PFN_ENTRY(cuCtxCreate, 3020, 1),
    PFN_ENTRY(cuCtxDestroy, 4000, 1),
    PFN_ENTRY(cuCtxGetCurrent, 4000, 1),
    PFN_ENTRY(cuCtxSetCurrent, 4000, 1),
    PFN_ENTRY(cuCtxGetDevice, 2000, 1),
    /* cuMem API support */
    PFN_ENTRY(cuMemAddressReserve, 10020, 1),
    PFN_ENTRY(cuMemAddressFree, 10020, 1),
    PFN_ENTRY(cuMemCreate, 10020, 1),
    PFN_ENTRY(cuMemGetAllocationGranularity, 10020, 1),
    PFN_ENTRY(cuMemExportToShareableHandle, 10020, 1),
    PFN_ENTRY(cuMemImportFromShareableHandle, 10020, 1),
    PFN_ENTRY(cuMemMap, 10020, 1),
    PFN_ENTRY(cuMemRelease, 10020, 1),
    PFN_ENTRY(cuMemRetainAllocationHandle, 11000, 1),
    PFN_ENTRY(cuMemSetAccess, 10020, 1),
    PFN_ENTRY(cuMemUnmap, 10020, 1),
#if CUDA_VERSION >= 11070
    PFN_ENTRY(cuMemGetHandleForAddressRange, 11070, 1), // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
    /* NVSwitch Multicast support */
    PFN_ENTRY(cuMulticastAddDevice, 12010, 1),
    PFN_ENTRY(cuMulticastBindMem, 12010, 1),
    PFN_ENTRY(cuMulticastBindAddr, 12010, 1),
    PFN_ENTRY(cuMulticastCreate, 12010, 1),
    PFN_ENTRY(cuMulticastGetGranularity, 12010, 1),
    PFN_ENTRY(cuMulticastUnbind, 12010, 1),
#endif
  };

#undef PFN_ENTRY

  for (const PfnEntry& entry : entries) {
    CUresult res = pfn_cuGetProcAddress(entry.name, entry.slot, entry.version, 0);
    // Only entries that may not be ignored turn a failed lookup into an error.
    if (res != 0 && !entry.ignore) {
      WARN("Retrieve %s version %d failed with %d", entry.name, entry.version, res);
      return ncclSystemError;
    }
  }
  return ncclSuccess;
}
#endif
|
|
|
|
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
|
static ncclResult_t initResult;
|
|
|
|
static void initOnceFunc() {
|
|
do {
|
|
char* val = getenv("CUDA_LAUNCH_BLOCKING");
|
|
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
|
|
} while (0);
|
|
|
|
CUresult res;
|
|
/*
|
|
* Load CUDA driver library
|
|
*/
|
|
char path[1024];
|
|
char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
|
|
if (ncclCudaPath == NULL)
|
|
snprintf(path, 1024, "%s", "libcuda.so");
|
|
else
|
|
snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so");
|
|
|
|
(void) dlerror(); // Clear any previous errors
|
|
cudaLib = dlopen(path, RTLD_LAZY);
|
|
if (cudaLib == NULL) {
|
|
WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
|
|
goto error;
|
|
}
|
|
|
|
/*
|
|
* Load initial CUDA functions
|
|
*/
|
|
|
|
pfn_cuInit = (PFN_cuInit_v2000) dlsym(cudaLib, "cuInit");
|
|
if (pfn_cuInit == NULL) {
|
|
WARN("Failed to load CUDA missing symbol cuInit");
|
|
goto error;
|
|
}
|
|
|
|
pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion_v2020) dlsym(cudaLib, "cuDriverGetVersion");
|
|
if (pfn_cuDriverGetVersion == NULL) {
|
|
WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
|
|
goto error;
|
|
}
|
|
|
|
int driverVersion;
|
|
res = pfn_cuDriverGetVersion(&driverVersion);
|
|
if (res != 0) {
|
|
WARN("cuDriverGetVersion failed with %d", res);
|
|
goto error;
|
|
}
|
|
|
|
INFO(NCCL_INIT, "cudaDriverVersion %d", driverVersion);
|
|
|
|
if (driverVersion < CUDA_DRIVER_MIN_VERSION) {
|
|
// WARN("CUDA Driver version found is %d. Minimum requirement is %d", driverVersion, CUDA_DRIVER_MIN_VERSION);
|
|
// Silently ignore version check mismatch for backwards compatibility
|
|
goto error;
|
|
}
|
|
|
|
pfn_cuGetProcAddress = (PFN_cuGetProcAddress_v11030) dlsym(cudaLib, "cuGetProcAddress");
|
|
if (pfn_cuGetProcAddress == NULL) {
|
|
WARN("Failed to load CUDA missing symbol cuGetProcAddress");
|
|
goto error;
|
|
}
|
|
|
|
/*
|
|
* Required to initialize the CUDA Driver.
|
|
* Multiple calls of cuInit() will return immediately
|
|
* without making any relevant change
|
|
*/
|
|
pfn_cuInit(0);
|
|
|
|
#if CUDART_VERSION >= 11030
|
|
if (cudaPfnFuncLoader()) {
|
|
WARN("CUDA some PFN functions not found in the library");
|
|
goto error;
|
|
}
|
|
#endif
|
|
|
|
// Determine whether we support the cuMem APIs or not
|
|
ncclCuMemSupported = ncclIsCuMemSupported();
|
|
|
|
initResult = ncclSuccess;
|
|
return;
|
|
error:
|
|
initResult = ncclSystemError;
|
|
return;
|
|
}
|
|
|
|
// Runs initOnceFunc() through pthread_once() and returns the result that
// initOnceFunc() stored in initResult.
ncclResult_t ncclCudaLibraryInit() {
  (void) pthread_once(&initOnceControl, initOnceFunc);
  const ncclResult_t status = initResult;
  return status;
}
|