From 7374c512d95ce889307f7e284221bb53ffa9fd89 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Sat, 6 Mar 2021 20:32:30 -0800 Subject: [PATCH] Add GPU memory usage tracker (#326) [ROCm/rccl commit: f60b76c67a5e550b6c30b677fcb314c3b8c7a6ac] --- projects/rccl/src/group.cc | 2 ++ projects/rccl/src/include/alloc.h | 19 +++++++++++++++++++ projects/rccl/src/init.cc | 5 ++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/projects/rccl/src/group.cc b/projects/rccl/src/group.cc index 3ab95c0533..56dc892e0d 100644 --- a/projects/rccl/src/group.cc +++ b/projects/rccl/src/group.cc @@ -15,6 +15,7 @@ thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS]; thread_local int ncclGroupIndex = 0; thread_local int ncclGroupMode = 0; thread_local ncclResult_t ncclGroupError = ncclSuccess; +extern struct allocationTracker allocTracker[]; bool ncclAsyncMode() { return ncclGroupMode > 0; @@ -205,6 +206,7 @@ ncclResult_t ncclGroupEnd() { WARN("Error waiting for pthread_join : %s", strerror(errno)); return ncclSystemError; } + INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize); NCCLCHECKGOTO(args->ret, ret, end); args->coll.comm->connect = 0; } diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h index b2af04a1c1..27a9ab7761 100644 --- a/projects/rccl/src/include/alloc.h +++ b/projects/rccl/src/include/alloc.h @@ -37,6 +37,19 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { return ncclSuccess; } +struct __attribute__ ((aligned(64))) allocationTracker { + union { + struct { + uint64_t totalAlloc; + uint64_t totalAllocSize; + }; + char align[64]; + }; +}; +static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes"); +#define MAX_ALLOC_TRACK_NGPU 32 +extern struct allocationTracker allocTracker[]; + template static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) { if (isFineGrain) @@ -44,6 +57,12 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = fal else CUDACHECK(hipMalloc(ptr, nelem*sizeof(T))); CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T))); + int dev; + CUDACHECK(hipGetDevice(&dev)); + if (dev < MAX_ALLOC_TRACK_NGPU) { + __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_SEQ_CST); + __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_SEQ_CST); + } return ncclSuccess; } diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 3ebe870465..1ccbd83c8e 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -58,6 +58,8 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); ncclNet_t* ncclNet = NULL; ncclCollNet_t* ncclCollNet = NULL; +struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {}; + // Returns ncclInternalError if anything fails, causing that network to be ignored. ncclResult_t initNet(ncclNet_t* net) { int ndev; @@ -1141,7 +1143,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, allocTracker[(*newcomm)->cudaDev].totalAllocSize); return ncclSuccess; cleanup: @@ -1161,6 +1163,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni NCCLCHECKGOTO(ncclInit(), res, end); if (myrank == 0) showVersion(); + memset(allocTracker+cudaDev, 0, sizeof(struct allocationTracker)); // Make sure the CUDA runtime is initialized. CUDACHECKGOTO(hipFree(NULL), res, end);