Files
rocm-systems/src/debug.cc
T
Sylvain Jeaugey b221128eca 2.6.4-1
Add support for network collectives.
Add support for XML topology dump/injection.
Add text values for GDR and P2P Levels, including "NVL".
Add speed detection for PCI, Infiniband and Ethernet cards.
Add CPU detection for ARM and AMD CPUs.
Add support for adaptive routing on Infiniband.
Change NET plugin API to v3 : merge PCI path and GPU pointer
  capability into a single structure and add other properties.
2020-03-20 14:58:36 -07:00

168 lines
5.9 KiB
C++

/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
// Current debug level. -1 means "not initialized yet": ncclDebugLog() calls
// ncclDebugInit() while this is still -1, and ncclDebugInit() returns early
// once it holds any other value.
int ncclDebugLevel = -1;
// Per-thread override. When non-zero, ncclDebugLog() reports WARN messages as
// INFO and uses this value as the message's subsystem flags.
thread_local int ncclDebugNoWarn = 0;
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
// Stream that log lines are written to. Defaults to stdout; ncclDebugInit()
// replaces it when the NCCL_DEBUG_FILE path can be opened.
FILE *ncclDebugFile = stdout;
// Held by ncclDebugInit() during initialization and by ncclDebugLog() while a
// message is formatted and written.
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
/* Initialize the debug subsystem from the environment.
 *
 * Reads, under ncclDebugLock:
 *   NCCL_DEBUG        - VERSION, WARN, INFO, ABORT or TRACE (case-insensitive);
 *                       sets ncclDebugLevel. Unset means NCCL_LOG_NONE.
 *   NCCL_DEBUG_SUBSYS - comma separated list of INIT, COLL, P2P, SHM, NET,
 *                       GRAPH, TUNING, ALL, optionally prefixed with '^' to
 *                       select everything except the listed subsystems;
 *                       sets ncclDebugMask.
 *   NCCL_DEBUG_FILE   - output path; %h expands to the host name, %p to the
 *                       pid and %% to a literal '%'. Only used when the level
 *                       is above VERSION; sets ncclDebugFile on success.
 *
 * Returns immediately if ncclDebugLevel has already been set.
 */
void ncclDebugInit() {
  pthread_mutex_lock(&ncclDebugLock);
  // Already initialized (possibly by another thread while we waited for the
  // lock). The lock must be released here, otherwise the next locker deadlocks.
  if (ncclDebugLevel != -1) {
    pthread_mutex_unlock(&ncclDebugLock);
    return;
  }

  const char* nccl_debug = getenv("NCCL_DEBUG");
  if (nccl_debug == NULL) {
    ncclDebugLevel = NCCL_LOG_NONE;
  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
    ncclDebugLevel = NCCL_LOG_VERSION;
  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
    ncclDebugLevel = NCCL_LOG_WARN;
  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
    ncclDebugLevel = NCCL_LOG_INFO;
  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
    ncclDebugLevel = NCCL_LOG_ABORT;
  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
    ncclDebugLevel = NCCL_LOG_TRACE;
  }
  // NOTE(review): an unrecognized NCCL_DEBUG value leaves ncclDebugLevel at -1,
  // so ncclDebugLog() will call this function again for every message.
  // Behavior intentionally kept; confirm whether that is desired.

  /* Parse the NCCL_DEBUG_SUBSYS env var
   * This can be a comma separated list such as INIT,COLL
   * or ^INIT,COLL etc
   */
  char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
  if (ncclDebugSubsysEnv != NULL) {
    int invert = 0;
    if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
    // strtok() modifies its input, so tokenize a private copy of the value.
    char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
    // If the copy cannot be allocated, keep the default mask instead of
    // handing a NULL pointer to strtok().
    if (ncclDebugSubsys != NULL) {
      // Inverted lists start from "everything" and clear bits; normal lists
      // start from "nothing" and set bits.
      ncclDebugMask = invert ? ~0ULL : 0ULL;
      char *subsys = strtok(ncclDebugSubsys, ",");
      while (subsys != NULL) {
        uint64_t mask = 0;
        if (strcasecmp(subsys, "INIT") == 0) {
          mask = NCCL_INIT;
        } else if (strcasecmp(subsys, "COLL") == 0) {
          mask = NCCL_COLL;
        } else if (strcasecmp(subsys, "P2P") == 0) {
          mask = NCCL_P2P;
        } else if (strcasecmp(subsys, "SHM") == 0) {
          mask = NCCL_SHM;
        } else if (strcasecmp(subsys, "NET") == 0) {
          mask = NCCL_NET;
        } else if (strcasecmp(subsys, "GRAPH") == 0) {
          mask = NCCL_GRAPH;
        } else if (strcasecmp(subsys, "TUNING") == 0) {
          mask = NCCL_TUNING;
        } else if (strcasecmp(subsys, "ALL") == 0) {
          mask = NCCL_ALL;
        }
        // Unknown names leave mask at 0 and are ignored.
        if (mask) {
          if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
        }
        subsys = strtok(NULL, ",");
      }
      free(ncclDebugSubsys);
    }
  }

  /* Parse and expand the NCCL_DEBUG_FILE path and
   * then create the debug file. But don't bother unless the
   * NCCL_DEBUG level is > VERSION
   */
  const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
  if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
    int c = 0;
    char debugFn[PATH_MAX+1] = "";
    char *dfn = debugFn;
    // One past the last position that may hold a path character; the byte at
    // dfnEnd itself is reserved for the terminating NUL.
    char *const dfnEnd = debugFn + PATH_MAX;
    // Bound the loop on the OUTPUT position: expansions such as %h can make
    // the result longer than the pattern. Overlong results are truncated.
    while (ncclDebugFileEnv[c] != '\0' && dfn < dfnEnd) {
      if (ncclDebugFileEnv[c++] != '%') {
        *dfn++ = ncclDebugFileEnv[c-1];
        continue;
      }
      const char spec = ncclDebugFileEnv[c];
      if (spec == '\0') {
        // Pattern ends with a lone '%': echo it and stop, rather than
        // stepping over the terminator and reading past the string.
        *dfn++ = '%';
        break;
      }
      c++;
      // Space left in debugFn from dfn onwards, including the NUL slot.
      const size_t room = (size_t)(dfnEnd - dfn) + 1;
      switch (spec) {
        case '%': // Double %
          *dfn++ = '%';
          break;
        case 'h': { // %h = hostname
          char hostname[1024];
          getHostName(hostname, 1024, '.');
          int n = snprintf(dfn, room, "%s", hostname);
          // snprintf returns the untruncated length; advance only by what
          // was actually stored.
          if (n > 0) dfn += ((size_t)n < room) ? (size_t)n : room - 1;
          break;
        }
        case 'p': { // %p = pid
          int n = snprintf(dfn, room, "%d", getpid());
          if (n > 0) dfn += ((size_t)n < room) ? (size_t)n : room - 1;
          break;
        }
        default: // Echo everything we don't understand
          *dfn++ = '%';
          if (dfn < dfnEnd) *dfn++ = spec;
          break;
      }
    }
    *dfn = '\0';
    if (debugFn[0] != '\0') {
      FILE *file = fopen(debugFn, "w");
      // If the file cannot be opened, keep logging to the current stream.
      if (file != NULL) {
        ncclDebugFile = file;
      }
    }
  }

#ifdef ENABLE_TRACE
  // Reference point for the timestamps printed on TRACE messages.
  ncclEpoch = std::chrono::high_resolution_clock::now();
#endif
  pthread_mutex_unlock(&ncclDebugLock);
}
/* Common logging function used by the INFO, WARN and TRACE macros
 * Also exported to the dynamically loadable Net transport modules so
 * they can share the debugging mechanisms and output files
 *
 * level    - severity of the message (WARN, INFO or TRACE are handled here).
 * flags    - subsystem bits; INFO and TRACE messages are emitted only when
 *            they intersect ncclDebugMask.
 * filefunc - source location label, printed for WARN, TRACE and ABORT.
 * line     - source line number, printed alongside filefunc.
 * fmt, ... - printf-style format string and arguments for the message body.
 *
 * The complete line (prefix + message) is limited to 1023 characters; longer
 * output is truncated. When ncclDebugLevel is NCCL_LOG_ABORT a WARN message
 * also terminates the process via abort().
 */
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  // Lazy initialization on first use.
  if (ncclDebugLevel == -1) ncclDebugInit();
  // The calling thread asked for warnings to be demoted: report as INFO under
  // the subsystem bits stored in ncclDebugNoWarn.
  if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }

  char hostname[1024];
  getHostName(hostname, 1024, '.');
  // Logging must keep working even when the device query fails, so the return
  // code is deliberately ignored; -1 is printed instead of an indeterminate
  // value in that case.
  int cudaDev = -1;
  (void) cudaGetDevice(&cudaDev);

  char buffer[1024];
  // Raw snprintf result for the prefix: may be negative on error, or larger
  // than the buffer when the prefix was truncated.
  int prefixLen = 0;
  pthread_mutex_lock(&ncclDebugLock);
  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
    prefixLen = snprintf(buffer, sizeof(buffer),
        "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
    prefixLen = snprintf(buffer, sizeof(buffer),
        "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
#ifdef ENABLE_TRACE
  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
    // Milliseconds elapsed since ncclEpoch.
    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
    prefixLen = snprintf(buffer, sizeof(buffer),
        "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
  }
#endif
  if (prefixLen > 0) {
    // Clamp to what is really stored in the buffer. Without this, a long
    // hostname/filefunc makes buffer+len point past the array and
    // sizeof(buffer)-len wrap around to a huge size.
    size_t len = (size_t)prefixLen;
    if (len > sizeof(buffer)-1) len = sizeof(buffer)-1;
    va_list vargs;
    va_start(vargs, fmt);
    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
    va_end(vargs);
    fprintf(ncclDebugFile,"%s\n", buffer);
    fflush(ncclDebugFile);
  }
  pthread_mutex_unlock(&ncclDebugLock);

  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
        hostname, getpid(), gettid(), cudaDev, filefunc, line);
    abort();
  }
}