@@ -15,6 +15,7 @@ thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
|
||||
thread_local int ncclGroupIndex = 0;
|
||||
thread_local int ncclGroupMode = 0;
|
||||
thread_local ncclResult_t ncclGroupError = ncclSuccess;
|
||||
extern struct allocationTracker allocTracker[];
|
||||
|
||||
bool ncclAsyncMode() {
|
||||
return ncclGroupMode > 0;
|
||||
@@ -205,6 +206,7 @@ ncclResult_t ncclGroupEnd() {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect = 0;
|
||||
}
|
||||
|
||||
@@ -37,6 +37,19 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct __attribute__ ((aligned(64))) allocationTracker {
|
||||
union {
|
||||
struct {
|
||||
uint64_t totalAlloc;
|
||||
uint64_t totalAllocSize;
|
||||
};
|
||||
char align[64];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes");
|
||||
#define MAX_ALLOC_TRACK_NGPU 32
|
||||
extern struct allocationTracker allocTracker[];
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
if (isFineGrain)
|
||||
@@ -44,6 +57,12 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = fal
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_SEQ_CST);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_SEQ_CST);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -58,6 +58,8 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||
ncclNet_t* ncclNet = NULL;
|
||||
ncclCollNet_t* ncclCollNet = NULL;
|
||||
|
||||
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
|
||||
|
||||
// Returns ncclInternalError if anything fails, causing that network to be ignored.
|
||||
ncclResult_t initNet(ncclNet_t* net) {
|
||||
int ndev;
|
||||
@@ -1141,7 +1143,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
|
||||
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
|
||||
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, allocTracker[(*newcomm)->cudaDev].totalAllocSize);
|
||||
|
||||
return ncclSuccess;
|
||||
cleanup:
|
||||
@@ -1161,6 +1163,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
|
||||
NCCLCHECKGOTO(ncclInit(), res, end);
|
||||
if (myrank == 0) showVersion();
|
||||
|
||||
memset(allocTracker+cudaDev, 0, sizeof(struct allocationTracker));
|
||||
// Make sure the CUDA runtime is initialized.
|
||||
CUDACHECKGOTO(hipFree(NULL), res, end);
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user