Device allocation tracker (#1878)
* alloc: add memory allocation tracker * alloc: add tracker for ncclCuMemAlloc() APIs * alloc: add null pointer check during free
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
c9becd89cd
Коммит
df448862c3
+81
-14
@@ -190,7 +190,7 @@ struct __attribute__ ((aligned(64))) allocationTracker {
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes");
|
||||
#define MAX_ALLOC_TRACK_NGPU 32
|
||||
#define MAX_ALLOC_TRACK_NGPU 128
|
||||
extern struct allocationTracker allocTracker[];
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
@@ -219,6 +219,11 @@ static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocation
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
|
||||
TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn);
|
||||
if (cudaDev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[cudaDev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[cudaDev].totalAllocSize, size, __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCuMemAllocAddr: Memory used = %ld on device = %d", allocTracker[cudaDev].totalAllocSize, cudaDev);
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -229,6 +234,15 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
|
||||
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
|
||||
|
||||
int dev;
|
||||
size *= -1;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, -1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, size, __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCuMemFreeAddr: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev);
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -265,6 +279,13 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
|
||||
if (handlep) *handlep = handle;
|
||||
TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle);
|
||||
|
||||
if (cudaDev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[cudaDev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[cudaDev].totalAllocSize, size, __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCuMemAlloc: Memory used = %ld on device = %d", allocTracker[cudaDev].totalAllocSize, cudaDev);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -280,6 +301,15 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
|
||||
CUCHECK(cuMemRelease(handle));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
|
||||
|
||||
int dev;
|
||||
size *= -1;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, -1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, size, __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCuMemFree: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev);
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -318,6 +348,15 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
else {
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT<T>(), __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCudaMallocDebug: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p flags %d", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr, flags);
|
||||
return result;
|
||||
}
|
||||
@@ -328,6 +367,8 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
int dev;
|
||||
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream = sideStream;
|
||||
@@ -338,15 +379,17 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
if (sideStream == nullptr)
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT<T>(), __ATOMIC_RELAXED);
|
||||
}
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
else {
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT<T>(), __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCudaCallocDebug: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p flags %d", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr, flags);
|
||||
return result;
|
||||
}
|
||||
@@ -357,20 +400,24 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
int dev;
|
||||
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (nelem > 0) {
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT<T>(), flags), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT<T>(), __ATOMIC_RELAXED);
|
||||
}
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
|
||||
}
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
else {
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*ncclSizeOfT<T>(), __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCudaCallocDebug: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p flags %d", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr, flags);
|
||||
return result;
|
||||
}
|
||||
@@ -408,6 +455,26 @@ ncclResult_t ncclCudaFree(T* ptr) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
|
||||
|
||||
// get the size of the allocation
|
||||
if (ptr != NULL) {
|
||||
CUdeviceptr baseAddress;
|
||||
size_t retrievedSize;
|
||||
|
||||
CUDACHECK(cuMemGetAddressRange(&baseAddress, &retrievedSize, ptr));
|
||||
retrievedSize *= -1;
|
||||
|
||||
if (ptr == baseAddress) {
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, -1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, retrievedSize, __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "ncclCudaFree: Memory used = %ld on device = %d", allocTracker[dev].totalAllocSize, dev);
|
||||
}
|
||||
}
|
||||
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
|
||||
|
||||
Ссылка в новой задаче
Block a user