From df448862c3692c1732d2fd87d3b626d1b85f840d Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Wed, 27 Aug 2025 09:30:51 -0500 Subject: [PATCH] Device allocation tracker (#1878) * alloc: add memory allocation tracker * alloc: add tracker for ncclCuMemAlloc() APIs * alloc: add null pointer check during free --- src/include/alloc.h | 95 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/src/include/alloc.h b/src/include/alloc.h index 9a840138f2..bc1c3ad2d6 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -190,7 +190,7 @@ struct __attribute__ ((aligned(64))) allocationTracker { }; }; static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes"); -#define MAX_ALLOC_TRACK_NGPU 32 +#define MAX_ALLOC_TRACK_NGPU 128 extern struct allocationTracker allocTracker[]; #if CUDART_VERSION >= 11030 @@ -219,6 +219,11 @@ static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocation accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn); + if (cudaDev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[cudaDev].totalAlloc, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[cudaDev].totalAllocSize, size, __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCuMemAllocAddr: Memory used = %ld on device = %d", allocTracker[cudaDev].totalAllocSize, cudaDev); return result; } @@ -229,6 +234,15 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + + int dev; + size *= -1; + CUDACHECK(hipGetDevice(&dev)); + if (dev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[dev].totalAlloc, -1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[dev].totalAllocSize, size, __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCuMemFreeAddr: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev); return result; } @@ -265,6 +279,13 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); if (handlep) *handlep = handle; TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle); + + if (cudaDev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[cudaDev].totalAlloc, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[cudaDev].totalAllocSize, size, __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCuMemAlloc: Memory used = %ld on device = %d", allocTracker[cudaDev].totalAllocSize, cudaDev); + return result; } @@ -280,6 +301,15 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + + int dev; + size *= -1; + CUDACHECK(hipGetDevice(&dev)); + if (dev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[dev].totalAlloc, -1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[dev].totalAllocSize, size, __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCuMemFree: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev); return result; } @@ -318,6 +348,15 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT()); + else { + int dev; + CUDACHECK(hipGetDevice(&dev)); + if (dev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT(), __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCudaMallocDebug: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev); + } INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p flags %d", filefunc, line, nelem*ncclSizeOfT(), *ptr, flags); return result; } @@ -328,6 +367,8 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; + int dev; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. cudaStream_t stream = sideStream; @@ -338,15 +379,17 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); if (sideStream == nullptr) CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); - int dev; - CUDACHECK(hipGetDevice(&dev)); - if (dev < MAX_ALLOC_TRACK_NGPU) { - __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED); - __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT(), __ATOMIC_RELAXED); - } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT()); + else { + CUDACHECK(hipGetDevice(&dev)); + if (dev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT(), __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCudaCallocDebug: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev); + } INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p flags %d", filefunc, line, nelem*ncclSizeOfT(), *ptr, flags); return result; } @@ -357,20 +400,24 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; + int dev; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT(), flags), result, finish); - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); - int dev; - CUDACHECK(hipGetDevice(&dev)); - if (dev < MAX_ALLOC_TRACK_NGPU) { - __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED); - __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT(), __ATOMIC_RELAXED); - } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT()); + else { + CUDACHECK(hipGetDevice(&dev)); + if (dev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT(), __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCudaCallocDebug: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev); + } INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p flags %d", filefunc, line, nelem*ncclSizeOfT(), *ptr, flags); return result; } @@ -408,6 +455,26 @@ ncclResult_t ncclCudaFree(T* ptr) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); + + // get the size of the allocation + if (ptr != NULL) { + CUdeviceptr baseAddress; + size_t retrievedSize; + + CUDACHECK(cuMemGetAddressRange(&baseAddress, &retrievedSize, ptr)); + retrievedSize *= -1; + + if (ptr == baseAddress) { + int dev; + CUDACHECK(hipGetDevice(&dev)); + if (dev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[dev].totalAlloc, -1, __ATOMIC_RELAXED); + __atomic_fetch_add(&allocTracker[dev].totalAllocSize, retrievedSize, __ATOMIC_RELAXED); + } + INFO(NCCL_ALLOC, "ncclCudaFree: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev); + } + } + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);