From ed237dcaa78687fe4c109f67198aaece58b5cede Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Mon, 26 Apr 2021 08:35:53 -0700 Subject: [PATCH] Use better name for kernel collective trace enable (#357) "NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,COLL" enables collectives API trace. Adding "RCCL_KERNEL_COLL_TRACE_ENABLE=1" enables kernel traces. --- src/init.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/init.cc b/src/init.cc index 359dac875b..80de771fe0 100644 --- a/src/init.cc +++ b/src/init.cc @@ -165,14 +165,12 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1; } -RCCL_PARAM(KernelCollEnable, "KERNEL_COLL_ENABLE", 0); -RCCL_PARAM(KernelCollPrint, "KERNEL_COLL_PRINT", 0); +RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0); #ifdef ENABLE_COLLTRACE void *ncclCommThreadMain(void *arg) { ncclComm_t comm = (ncclComm_t)arg; int head = comm->hostDevComm.collTraceHead; - bool kern_print = rcclParamKernelCollPrint() && rcclParamKernelCollEnable(); #define MAX_NAME_LENGTH 32 char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+1)); for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) { @@ -256,7 +254,7 @@ void *ncclCommThreadMain(void *arg) { break; } } - if (kern_print) INFO(NCCL_INIT, "%s", line); + INFO(NCCL_COLL, "%s", line); STORE(&(td->type), ncclCollTraceNotReady); head ++; head %= COLLTRACE_NUM_ITEMS; @@ -420,7 +418,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { NCCLCHECK(ncclCudaHostCalloc(&comm->hostDevComm.collTrace, COLLTRACE_NUM_ITEMS)); memset(comm->hostDevComm.collTrace, 0, sizeof(struct ncclCollTrace) * COLLTRACE_NUM_ITEMS); comm->hostDevComm.collTraceExit = comm->hostDevComm.collTraceHead = *comm->hostDevComm.collTraceTail = 0; - if ((ncclDebugLevel >= NCCL_LOG_INFO) && rcclParamKernelCollEnable()) + if ((ncclDebugLevel >= NCCL_LOG_INFO) && rcclParamKernelCollTraceEnable()) pthread_create(&comm->hostDevComm.collTraceThread, NULL, ncclCommThreadMain, (void *)comm); else comm->hostDevComm.collTraceThread = 0;