2025-05-29 20:56:40 -07:00
|
|
|
/*************************************************************************
|
|
|
|
|
* Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
|
|
|
|
*
|
|
|
|
|
* See LICENSE.txt for license information
|
|
|
|
|
************************************************************************/
|
|
|
|
|
|
|
|
|
|
#include "comm.h"
|
|
|
|
|
#include "transport.h"
|
|
|
|
|
#include "group.h"
|
|
|
|
|
|
|
|
|
|
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
|
2025-08-28 15:45:42 -05:00
|
|
|
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
|
2025-05-29 20:56:40 -07:00
|
|
|
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
|
|
|
|
ncclResult_t ret = ncclSuccess;
|
|
|
|
|
|
2025-08-28 15:45:42 -05:00
|
|
|
#if ROCM_VERSION >= 70000
|
2025-05-29 20:56:40 -07:00
|
|
|
size_t memGran = 0;
|
|
|
|
|
CUdevice currentDev;
|
|
|
|
|
CUmemAllocationProp memprop = {};
|
|
|
|
|
CUmemAccessDesc accessDesc = {};
|
|
|
|
|
CUmemGenericAllocationHandle handle = (CUmemGenericAllocationHandle)-1;
|
|
|
|
|
int cudaDev;
|
|
|
|
|
int flag;
|
|
|
|
|
int dcnt;
|
|
|
|
|
|
|
|
|
|
if (ptr == NULL || size == 0) goto fallback;
|
|
|
|
|
|
2025-08-28 15:45:42 -05:00
|
|
|
// if (rocmLibraryInit() != ncclSuccess) goto fallback;
|
|
|
|
|
rocmLibraryInit();
|
2025-05-29 20:56:40 -07:00
|
|
|
|
|
|
|
|
CUDACHECK(cudaGetDevice(&cudaDev));
|
|
|
|
|
CUCHECK(cuDeviceGet(¤tDev, cudaDev));
|
|
|
|
|
|
|
|
|
|
if (ncclCuMemEnable()) {
|
|
|
|
|
size_t handleSize = size;
|
|
|
|
|
int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
|
|
|
|
// Query device to see if FABRIC handle support is available
|
|
|
|
|
flag = 0;
|
|
|
|
|
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));
|
|
|
|
|
if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
|
|
|
|
|
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
|
|
|
|
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
|
|
|
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
|
|
|
|
|
memprop.location.id = currentDev;
|
|
|
|
|
// Query device to see if RDMA support is available
|
|
|
|
|
flag = 0;
|
2025-08-28 15:45:42 -05:00
|
|
|
// CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
|
2025-05-29 20:56:40 -07:00
|
|
|
if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
|
|
|
|
|
CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
|
|
|
|
CUDACHECK(cudaGetDeviceCount(&dcnt));
|
|
|
|
|
ALIGN_SIZE(handleSize, memGran);
|
|
|
|
|
|
|
|
|
|
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
|
|
|
|
|
/* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
|
|
|
|
|
CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0));
|
2025-08-28 15:45:42 -05:00
|
|
|
if (err == CUDA_ERROR_NOT_SUPPORTED) {
|
2025-05-29 20:56:40 -07:00
|
|
|
requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
|
|
|
|
|
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
|
|
|
|
|
/* Allocate the physical memory on the device */
|
|
|
|
|
CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
|
|
|
|
|
} else if (err != CUDA_SUCCESS) {
|
|
|
|
|
// Catch and report any error from above
|
|
|
|
|
CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
/* Allocate the physical memory on the device */
|
|
|
|
|
CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
|
|
|
|
|
}
|
|
|
|
|
/* Reserve a virtual address range */
|
|
|
|
|
CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0));
|
|
|
|
|
/* Map the virtual address range to the physical allocation */
|
|
|
|
|
CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0));
|
|
|
|
|
/* Now allow RW access to the newly mapped memory */
|
|
|
|
|
for (int i = 0; i < dcnt; ++i) {
|
|
|
|
|
int p2p = 0;
|
|
|
|
|
if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, i, cudaDev) == cudaSuccess) && p2p)) {
|
|
|
|
|
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
|
|
|
|
accessDesc.location.id = i;
|
|
|
|
|
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
|
|
|
|
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1));
|
|
|
|
|
}
|
|
|
|
|
if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
|
|
|
|
|
}
|
|
|
|
|
goto exit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fallback:
|
|
|
|
|
#endif
|
|
|
|
|
// Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though:
|
|
|
|
|
// we want CUDA to return an error to the caller.
|
|
|
|
|
// coverity[var_deref_model]
|
|
|
|
|
CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
|
|
|
|
|
|
|
|
|
|
exit:
|
|
|
|
|
return ret;
|
|
|
|
|
fail:
|
|
|
|
|
goto exit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
|
2025-08-28 15:45:42 -05:00
|
|
|
ncclResult_t ncclMemFree_impl(void *ptr) {
|
2025-05-29 20:56:40 -07:00
|
|
|
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
|
|
|
|
ncclResult_t ret = ncclSuccess;
|
|
|
|
|
int saveDevice;
|
|
|
|
|
|
|
|
|
|
CUDACHECK(cudaGetDevice(&saveDevice));
|
2025-08-28 15:45:42 -05:00
|
|
|
#if ROCM_VERSION >= 70000
|
2025-05-29 20:56:40 -07:00
|
|
|
CUdevice ptrDev = 0;
|
|
|
|
|
|
|
|
|
|
if (ptr == NULL) goto fallback;
|
2025-08-28 15:45:42 -05:00
|
|
|
// if (rocmLibraryInit() != ncclSuccess) goto fallback;
|
|
|
|
|
rocmLibraryInit();
|
2025-05-29 20:56:40 -07:00
|
|
|
|
|
|
|
|
CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail);
|
|
|
|
|
CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail);
|
|
|
|
|
if (ncclCuMemEnable()) {
|
|
|
|
|
NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
|
|
|
|
|
goto exit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fallback:
|
|
|
|
|
#endif
|
|
|
|
|
CUDACHECKGOTO(cudaFree(ptr), ret, fail);
|
|
|
|
|
|
|
|
|
|
exit:
|
|
|
|
|
CUDACHECK(cudaSetDevice(saveDevice));
|
|
|
|
|
return ret;
|
|
|
|
|
fail:
|
|
|
|
|
goto exit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// This is a collective function and should be called by all ranks in the communicator
|
|
|
|
|
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
|
|
|
|
|
ncclResult_t ret = ncclSuccess;
|
|
|
|
|
void* regSymAddr = NULL;
|
|
|
|
|
size_t allocSize = size;
|
|
|
|
|
size_t granularity;
|
|
|
|
|
CUdevice cuDev;
|
|
|
|
|
CUmemAllocationProp memprop = {};
|
|
|
|
|
CUmemGenericAllocationHandle memHandle;
|
|
|
|
|
int bit = 0, cnt = 0;
|
|
|
|
|
|
|
|
|
|
// aligment must be power of 2 as an input
|
|
|
|
|
while (bit < sizeof(size_t) * 8) {
|
|
|
|
|
if (alignment & (1L << bit)) cnt++;
|
|
|
|
|
if (cnt == 2) {
|
|
|
|
|
WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment);
|
|
|
|
|
goto fail;
|
|
|
|
|
}
|
|
|
|
|
bit++;
|
|
|
|
|
}
|
|
|
|
|
// temporarily align the alignment to NCCL_REC_PAGE_SIZE
|
|
|
|
|
ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);
|
|
|
|
|
|
|
|
|
|
CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
|
|
|
|
|
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
|
|
|
|
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
2025-08-28 15:45:42 -05:00
|
|
|
memprop.requestedHandleType = ncclCuMemHandleType;
|
2025-05-29 20:56:40 -07:00
|
|
|
memprop.location.id = cuDev;
|
|
|
|
|
CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
|
|
|
|
|
ALIGN_SIZE(allocSize, granularity);
|
|
|
|
|
|
|
|
|
|
CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
|
|
|
|
|
ALIGN_SIZE(comm->symAllocHead, alignment);
|
|
|
|
|
NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, ®SymAddr), ret, fail);
|
|
|
|
|
NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
|
|
|
|
|
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
|
|
|
|
|
comm->symAllocHead += allocSize;
|
|
|
|
|
*symPtr = regSymAddr;
|
|
|
|
|
|
|
|
|
|
exit:
|
|
|
|
|
return ret;
|
|
|
|
|
fail:
|
|
|
|
|
*symPtr = NULL;
|
|
|
|
|
goto exit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Releases a symmetric buffer referenced by `symPtr` for communicator `comm`.
//
// The current device is recorded on entry and re-selected before returning.
// When ncclCuMemEnable() is false nothing else is done. Otherwise the function
// switches to comm->cudaDev, retains and releases the allocation handle behind
// `symPtr`, queries the size of the address range, calls ncclNvlsSymmetricFree
// and ncclIpcSymmetricFree with that size, and releases the handle once more.
//
// Returns ncclSuccess, or the error code set by the *CHECK* macros.
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
  CUmemGenericAllocationHandle allocHandle;
  size_t mappedBytes = 0;
  ncclResult_t status = ncclSuccess;
  // Seeded with the communicator's device, then overwritten by the query below.
  int prevDev = comm->cudaDev;

  CUDACHECKGOTO(cudaGetDevice(&prevDev), status, fail);

  // Nothing to tear down unless the cuMem path is enabled.
  if (!ncclCuMemEnable()) goto exit;

  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), status, fail);

  // Look up the handle behind symPtr, then drop the reference the lookup added.
  CUCHECKGOTO(cuMemRetainAllocationHandle(&allocHandle, symPtr), status, fail);
  CUCHECKGOTO(cuMemRelease(allocHandle), status, fail);

  // The size of the range is needed by both symmetric-free calls.
  CUCHECKGOTO(cuMemGetAddressRange(NULL, &mappedBytes, (CUdeviceptr)symPtr), status, fail);
  NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, mappedBytes, symPtr), status, fail);
  NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, mappedBytes, symPtr), status, fail);

  // NOTE(review): presumably this drops the reference held since allocation -- confirm.
  CUCHECKGOTO(cuMemRelease(allocHandle), status, fail);

exit:
  // Re-select the device that was current on entry.
  CUDACHECK(cudaSetDevice(prevDev));
  return status;
fail:
  goto exit;
}
|