From 7374c512d95ce889307f7e284221bb53ffa9fd89 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Sat, 6 Mar 2021 20:32:30 -0800
Subject: [PATCH] Add GPU memory usage tracker (#326)

[ROCm/rccl commit: f60b76c67a5e550b6c30b677fcb314c3b8c7a6ac]
---
 projects/rccl/src/group.cc        |  2 ++
 projects/rccl/src/include/alloc.h | 19 +++++++++++++++++++
 projects/rccl/src/init.cc         |  5 ++++-
 3 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/projects/rccl/src/group.cc b/projects/rccl/src/group.cc
index 3ab95c0533..56dc892e0d 100644
--- a/projects/rccl/src/group.cc
+++ b/projects/rccl/src/group.cc
@@ -15,6 +15,7 @@ thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
 thread_local int ncclGroupIndex = 0;
 thread_local int ncclGroupMode = 0;
 thread_local ncclResult_t ncclGroupError = ncclSuccess;
+extern struct allocationTracker allocTracker[];
 
 bool ncclAsyncMode() {
   return ncclGroupMode > 0;
@@ -205,6 +206,7 @@ ncclResult_t ncclGroupEnd() {
         WARN("Error waiting for pthread_join : %s", strerror(errno));
         return ncclSystemError;
       }
+      INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
       NCCLCHECKGOTO(args->ret, ret, end);
       args->coll.comm->connect = 0;
     }
diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h
index b2af04a1c1..27a9ab7761 100644
--- a/projects/rccl/src/include/alloc.h
+++ b/projects/rccl/src/include/alloc.h
@@ -37,6 +37,19 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
   return ncclSuccess;
 }
 
+struct __attribute__ ((aligned(64))) allocationTracker {
+  union {
+    struct {
+      uint64_t totalAlloc;
+      uint64_t totalAllocSize;
+    };
+    char align[64];
+  };
+};
+static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes");
+#define MAX_ALLOC_TRACK_NGPU 32
+extern struct allocationTracker allocTracker[];
+
 template <typename T>
 static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
   if (isFineGrain)
@@ -44,6 +57,12 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = fal
   else
     CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
   CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
+  int dev;
+  CUDACHECK(hipGetDevice(&dev));
+  if (dev < MAX_ALLOC_TRACK_NGPU) {
+    __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_SEQ_CST);
+    __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_SEQ_CST);
+  }
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 3ebe870465..1ccbd83c8e 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -58,6 +58,8 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
 ncclNet_t* ncclNet = NULL;
 ncclCollNet_t* ncclCollNet = NULL;
 
+struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
+
 // Returns ncclInternalError if anything fails, causing that network to be ignored.
 ncclResult_t initNet(ncclNet_t* net) {
   int ndev;
@@ -1141,7 +1143,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
   NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
 
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, allocTracker[(*newcomm)->cudaDev].totalAllocSize);
 
   return ncclSuccess;
 cleanup:
@@ -1161,6 +1163,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
   NCCLCHECKGOTO(ncclInit(), res, end);
   if (myrank == 0) showVersion();
 
+  memset(allocTracker+cudaDev, 0, sizeof(struct allocationTracker));
   // Make sure the CUDA runtime is initialized.
   CUDACHECKGOTO(hipFree(NULL), res, end);