[ROCm/rccl commit: f60b76c67a]
Этот коммит содержится в:
Wenkai Du
2021-03-06 20:32:30 -08:00
коммит произвёл GitHub
родитель b7253710ca
Коммит 7374c512d9
3 изменённых файлов: 25 добавлений и 1 удалений
+2
Просмотреть файл
@@ -15,6 +15,7 @@ thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
thread_local int ncclGroupIndex = 0;
thread_local int ncclGroupMode = 0;
thread_local ncclResult_t ncclGroupError = ncclSuccess;
extern struct allocationTracker allocTracker[];
bool ncclAsyncMode() {
return ncclGroupMode > 0;
@@ -205,6 +206,7 @@ ncclResult_t ncclGroupEnd() {
WARN("Error waiting for pthread_join : %s", strerror(errno));
return ncclSystemError;
}
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
NCCLCHECKGOTO(args->ret, ret, end);
args->coll.comm->connect = 0;
}
+19
Просмотреть файл
@@ -37,6 +37,19 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
return ncclSuccess;
}
struct __attribute__ ((aligned(64))) allocationTracker {
union {
struct {
uint64_t totalAlloc;
uint64_t totalAllocSize;
};
char align[64];
};
};
static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes");
#define MAX_ALLOC_TRACK_NGPU 32
extern struct allocationTracker allocTracker[];
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
if (isFineGrain)
@@ -44,6 +57,12 @@ static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = fal
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
int dev;
CUDACHECK(hipGetDevice(&dev));
if (dev < MAX_ALLOC_TRACK_NGPU) {
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_SEQ_CST);
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_SEQ_CST);
}
return ncclSuccess;
}
+4 -1
Просмотреть файл
@@ -58,6 +58,8 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
ncclNet_t* ncclNet = NULL;
ncclCollNet_t* ncclCollNet = NULL;
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
// Returns ncclInternalError if anything fails, causing that network to be ignored.
ncclResult_t initNet(ncclNet_t* net) {
int ndev;
@@ -1141,7 +1143,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, allocTracker[(*newcomm)->cudaDev].totalAllocSize);
return ncclSuccess;
cleanup:
@@ -1161,6 +1163,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
NCCLCHECKGOTO(ncclInit(), res, end);
if (myrank == 0) showVersion();
memset(allocTracker+cudaDev, 0, sizeof(struct allocationTracker));
// Make sure the CUDA runtime is initialized.
CUDACHECKGOTO(hipFree(NULL), res, end);