Add GPU memory usage tracker (#326)

[ROCm/rccl commit: f60b76c67a]
2021-03-06 20:32:30 -08:00
@@ -15,6 +15,7 @@ thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
 thread_local int ncclGroupIndex = 0;
 thread_local int ncclGroupMode = 0;
 thread_local ncclResult_t ncclGroupError = ncclSuccess;
+extern struct allocationTracker allocTracker[];

 bool ncclAsyncMode() {
  return ncclGroupMode > 0;
@@ -205,6 +206,7 @@ ncclResult_t ncclGroupEnd() {
        WARN("Error waiting for pthread_join : %s", strerror(errno));
        return ncclSystemError;
      }
+      INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
      NCCLCHECKGOTO(args->ret, ret, end);
      args->coll.comm->connect = 0;
    }
@@ -37,6 +37,19 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
  return ncclSuccess;
 }

+struct __attribute__ ((aligned(64))) allocationTracker {
+  union {
+    struct {
+      uint64_t totalAlloc;
+      uint64_t totalAllocSize;
+    };
+    char align[64];
+  };
+};
+static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes");
+#define MAX_ALLOC_TRACK_NGPU 32
+extern struct allocationTracker allocTracker[];
+
 template <typename T>
 static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
  if (isFineGrain)
@@ -44,6 +57,12 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = fal
  else
    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
+  int dev;
+  CUDACHECK(hipGetDevice(&dev));
+  if (dev < MAX_ALLOC_TRACK_NGPU) {
+    __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_SEQ_CST);
+    __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_SEQ_CST);
+  }
  return ncclSuccess;
 }

@@ -58,6 +58,8 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
 ncclNet_t* ncclNet = NULL;
 ncclCollNet_t* ncclCollNet = NULL;

+struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
+
 // Returns ncclInternalError if anything fails, causing that network to be ignored.
 ncclResult_t initNet(ncclNet_t* net) {
  int ndev;
@@ -1141,7 +1143,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);

-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, allocTracker[(*newcomm)->cudaDev].totalAllocSize);

  return ncclSuccess;
 cleanup:
@@ -1161,6 +1163,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
  NCCLCHECKGOTO(ncclInit(), res, end);
  if (myrank == 0) showVersion();

+  memset(allocTracker+cudaDev, 0, sizeof(struct allocationTracker));
  // Make sure the CUDA runtime is initialized.
  CUDACHECKGOTO(hipFree(NULL), res, end);