From d4382de26724816f07fc949e5e4c0d84910c18a3 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Tue, 22 Dec 2020 13:28:21 -0500 Subject: [PATCH] Improve collective trace [ROCm/rccl commit: 2ddbe6646b3a6613bd9ca1ae3e3e9b2a14c46fe6] --- projects/rccl/src/collectives/device/common.h | 24 +++++-- projects/rccl/src/include/collectives.h | 3 + projects/rccl/src/include/devcomm.h | 13 +++- projects/rccl/src/init.cc | 63 ++++++++++++++----- projects/rccl/tools/topo_expl/utils.cpp | 2 +- 5 files changed, 83 insertions(+), 22 deletions(-) diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index 9ce25fa656..d96e2e307b 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -117,6 +117,8 @@ struct Caller{ void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); } }; +static_assert(FUNC_INDEX_P2P == 1800, "Wrong P2P function index"); + inline __device__ void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept { @@ -177,7 +179,19 @@ class ncclFunction { comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \ comm->collTrace[pos].opCount = w->opCount; \ comm->collTrace[pos].bid = bid; \ - comm->collTrace[pos].funcIndex = fIdx; + comm->collTrace[pos].funcIndex = fIdx; \ + if (fIdx == FUNC_INDEX_P2P) { \ + comm->collTrace[pos].p2p.nThreads = w->p2p.nThreads; \ + comm->collTrace[pos].p2p.delta = (uint16_t)(w->p2p.delta); \ + } else if (fIdx == FUNC_INDEX_A2AV) { \ + comm->collTrace[pos].coll.nThreads = w->nThreads; \ + comm->collTrace[pos].coll.bid = w->a2av.bid; \ + comm->collTrace[pos].coll.nChannels = w->a2av.nChannels; \ + } else { \ + comm->collTrace[pos].coll.nThreads = w->nThreads; \ + comm->collTrace[pos].coll.bid = w->coll.bid; \ + comm->collTrace[pos].coll.nChannels = w->coll.nChannels; \ + } #define traceKernelLaunch(fIdx) { \ traceColl(fIdx); \ asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (comm->collTrace[pos].data_0)); \ @@ -262,7 +276,7 @@ __device__ void ncclKernel(struct ncclWorkElem first) { if (w == NULL) { w = shmem.localWork.elems; if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) { - if (COLLTRACE && tid == 0) traceAbort(-1); + if (COLLTRACE && tid == 0) traceAbort(0xffff); return; } if (COLLTRACE && tid == 0) { @@ -270,11 +284,9 @@ __device__ void ncclKernel(struct ncclWorkElem first) { if (!firstLaunch) traceCollEnd(w->funcIndex); firstLaunch = false; } - } else { - if (COLLTRACE && tid == 0) { + } else if (COLLTRACE && tid == 0) { traceKernelLaunch(w->funcIndex); firstLaunch = false; - } } if (tid < w->nThreads) { if (w->funcIndex == FINDEX) { @@ -285,7 +297,7 @@ __device__ void ncclKernel(struct ncclWorkElem first) { } index = (index+1) % NCCL_MAX_OPS; if (w->active == 2) { - if (COLLTRACE && tid == 0) traceCollEnd(-1); + if (COLLTRACE && tid == 0) traceCollEnd(0xffff); return; } w = NULL; diff --git a/projects/rccl/src/include/collectives.h b/projects/rccl/src/include/collectives.h index 6b905385fd..eef7a3cd16 100644 --- a/projects/rccl/src/include/collectives.h +++ b/projects/rccl/src/include/collectives.h @@ -9,6 +9,9 @@ #define NCCL_COLLECTIVES_H_ #define FUNC_INDEX_P2P (NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) +#define FUNC_INDEX_A2A (FUNC_INDEX_P2P+1) +#define FUNC_INDEX_A2AV (FUNC_INDEX_P2P+2) + #define FUNC_INDEX(func, redop, ncclType, al, pr) ((func >= NCCL_NUM_FUNCTIONS) \ ? (func-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \ : ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))) diff --git a/projects/rccl/src/include/devcomm.h b/projects/rccl/src/include/devcomm.h index 341ddb2862..7fa6842b8d 100644 --- a/projects/rccl/src/include/devcomm.h +++ b/projects/rccl/src/include/devcomm.h @@ -286,7 +286,18 @@ struct ncclCollTrace { uint32_t data_0; uint64_t timeStamp; uint64_t opCount; - uint64_t data_1; + union { + uint64_t data_1; + struct { + uint16_t nThreads; + uint8_t bid; + uint8_t nChannels; + } coll; + struct { + uint16_t nThreads; + uint16_t delta; + } p2p; + }; }; static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size"); diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 734b992f89..421ceeda07 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -41,9 +41,11 @@ std::chrono::high_resolution_clock::time_point ncclEpoch; #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream #endif -const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "AllToAll", "AllToAllv" }; +const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; +const char* ncclRedOpStr[ncclNumOps] = { "Sum", "Prod", "Max", "Min" }; +const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "_u64", "_f16", "_f32", "_f64", "_b16"}; NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); @@ -161,6 +163,25 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { void *ncclCommThreadMain(void *arg) { ncclComm_t comm = (ncclComm_t)arg; int head = comm->hostDevComm.collTraceHead; + #define MAX_NAME_LENGTH 32 + char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_A2AV+1)); + for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) { + for (int al = 0; al < NCCL_NUM_ALGORITHMS; al++) { + for (int type = 0; type < ncclNumTypes; type++) { + for (int pr = 0; pr < NCCL_NUM_PROTOCOLS; pr++) { + for (int redop = 0; redop < ncclNumOps; redop++) { + char* line = func_names+MAX_NAME_LENGTH*FUNC_INDEX(func, redop, type, al, pr); + sprintf(line, "%s%s%s%s%s", ncclFuncStr[func], ncclAlgoStr[al], ncclProtoStr[pr], + ncclRedOpStr[redop], ncclTypeStr[type]); + } + } + } + } + } + for (int func = NCCL_NUM_FUNCTIONS; func < NCCL_NUM_FUNCTIONS+3; func++) { + char* line = func_names+MAX_NAME_LENGTH*(FUNC_INDEX_P2P+func-NCCL_NUM_FUNCTIONS); + sprintf(line, "%s", ncclFuncStr[func]); + } do { int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS; int count; @@ -177,32 +198,45 @@ void *ncclCommThreadMain(void *arg) { } } for (int i = 0; i < count; i++) { - uint8_t type = LOAD(&(comm->hostDevComm.collTrace[head].type)); + struct ncclCollTrace *td = comm->hostDevComm.collTrace+head; + uint8_t type = LOAD(&(td->type)); if (type == ncclCollTraceNotReady) break; char line[1024]; int offset = 0; + uint16_t fIdx = td->funcIndex; #define VEGA_GPU_RTC_FREQUENCY 2.5E7 if (type == ncclCollTraceDataType) { sprintf(line, "## [%12.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx", - (double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, - comm->hostDevComm.collTrace[head].funcIndex, - comm->hostDevComm.collTrace[head].data_0, - comm->hostDevComm.collTrace[head].opCount, - comm->hostDevComm.collTrace[head].data_1); + (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, + fIdx, td->data_0, td->opCount, td->data_1); } else { sprintf(line, "## [%12.6f] [%02d:%02d] %06lx", - (double)(comm->hostDevComm.collTrace[head].timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, comm->hostDevComm.collTrace[head].bid, comm->hostDevComm.collTrace[head].opCount); + (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount); offset = strlen(line); switch (type) { case ncclCollTraceKernelLaunchType: - sprintf(line+offset, " KL hwid %8x funcIndex %d", - comm->hostDevComm.collTrace[head].data_0, comm->hostDevComm.collTrace[head].funcIndex); + sprintf(line+offset, " KL HWID %8x %s ", + td->data_0, func_names+MAX_NAME_LENGTH*fIdx); + offset = strlen(line); + if (fIdx > FUNC_INDEX_A2AV) + sprintf(line+offset, "ERROR bad function index %d", fIdx); + else if (fIdx == FUNC_INDEX_P2P) + sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta); + else + sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels); break; case ncclCollTraceCollEndType: - if (comm->hostDevComm.collTrace[head].funcIndex != -1) - sprintf(line+offset, " CE next funcIndex %d", - comm->hostDevComm.collTrace[head].funcIndex); + if (fIdx != 0xffff) { + sprintf(line+offset, " CE %s ", func_names+MAX_NAME_LENGTH*fIdx); + offset = strlen(line); + if (fIdx > FUNC_INDEX_A2AV) + sprintf(line+offset, "ERROR bad function index %d", fIdx); + else if (fIdx == FUNC_INDEX_P2P) + sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta); + else + sprintf(line+offset, "nt %d bi %d nc %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels); + } else sprintf(line+offset, " KE"); break; @@ -215,11 +249,12 @@ void *ncclCommThreadMain(void *arg) { } } INFO(NCCL_COLL, "%s", line); - STORE(&(comm->hostDevComm.collTrace[head].type), ncclCollTraceNotReady); + STORE(&(td->type), ncclCollTraceNotReady); head ++; head %= COLLTRACE_NUM_ITEMS; } } while(1); + free(func_names); comm->hostDevComm.collTraceHead = head; pthread_exit(NULL); } diff --git a/projects/rccl/tools/topo_expl/utils.cpp b/projects/rccl/tools/topo_expl/utils.cpp index ca216eee81..e5bcf768fd 100644 --- a/projects/rccl/tools/topo_expl/utils.cpp +++ b/projects/rccl/tools/topo_expl/utils.cpp @@ -30,7 +30,7 @@ #include "model.h" #include "utils.h" -const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "AllToAll" }; +const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };