Summary: NCCL_DEBUG_FILE does not work properly since the recent v2.13.4 updates (https://github.com/NVIDIA/nccl/pull/682) because it nows sets `ncclDebugLevel` after parse `NCCL_DEBUG_FILE`. This patch move parsing `tempNcclDebugLevel` before processing `NCCL_DEBUG_FILE` to ensure `NCCL_DEBUG_FILE` is parsed only when `NCCL_DEBUG > NCCL_LOG_VERSION` (same as previous behavior)

Differential Revision: D38415208

fbshipit-source-id: 5689bbb798e73efb9e8594557666987f07e89a30
Этот коммит содержится в:
Ching-Hsiang Chu
2022-08-03 20:47:40 -07:00
коммит произвёл Sylvain Jeaugey
родитель 19ab67d172
Коммит e1d9b273b0
+15 -16
Просмотреть файл
@@ -26,6 +26,20 @@ void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL
@@ -80,7 +94,7 @@ void ncclDebugInit() {
* NCCL_DEBUG level is > VERSION
*/
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn;
@@ -115,21 +129,6 @@ void ncclDebugInit() {
}
}
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
ncclEpoch = std::chrono::steady_clock::now();
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
pthread_mutex_unlock(&ncclDebugLock);