diff --git a/CHANGELOG.md b/CHANGELOG.md index 9234702d52..7b7719971e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ## Unreleased - RCCL 2.27.7 for ROCm 7.2.0 +### Changed + +* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`. + ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1 ### Changed diff --git a/ext-net/example/nccl/common.h b/ext-net/example/nccl/common.h index 5aec2f7bba..d5da339021 100644 --- a/ext-net/example/nccl/common.h +++ b/ext-net/example/nccl/common.h @@ -9,7 +9,7 @@ #include -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-profiler/example/nccl/common.h b/ext-profiler/example/nccl/common.h index 912925225c..6afc97b4e7 100644 --- a/ext-profiler/example/nccl/common.h +++ b/ext-profiler/example/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, 
NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-tuner/basic/nccl/common.h b/ext-tuner/basic/nccl/common.h index 912925225c..6afc97b4e7 100644 --- a/ext-tuner/basic/nccl/common.h +++ b/ext-tuner/basic/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-tuner/example/nccl/common.h b/ext-tuner/example/nccl/common.h index 912925225c..6afc97b4e7 100644 --- a/ext-tuner/example/nccl/common.h +++ b/ext-tuner/example/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, 
NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c index 28897c4490..050d83ff0e 100644 --- a/ext-tuner/example/test/test_plugin.c +++ b/ext-tuner/example/test/test_plugin.c @@ -62,7 +62,8 @@ void mock_logger(ncclDebugLogLevel level, unsigned long flags, // Convert log level to string const char* level_str; switch(level) { - case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_ERROR: level_str = "ERROR"; break; case NCCL_LOG_VERSION: level_str = "VERSION"; break; case NCCL_LOG_WARN: level_str = "WARN"; break; case NCCL_LOG_INFO: level_str = "INFO"; break; diff --git a/ext-tuner/model_demo/nccl/common.h b/ext-tuner/model_demo/nccl/common.h index 912925225c..6afc97b4e7 100644 --- a/ext-tuner/model_demo/nccl/common.h +++ b/ext-tuner/model_demo/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/src/debug.cc b/src/debug.cc index e5a0ed9c26..2bdca5735d 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -47,6 +47,8 @@ static void ncclDebugInit() { ncclDebugFile = 
stdout; } if (nccl_debug == NULL) { + tempNcclDebugLevel = NCCL_LOG_ERROR; + } else if (strcasecmp(nccl_debug, "NONE") == 0) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { tempNcclDebugLevel = NCCL_LOG_VERSION; @@ -372,6 +374,8 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file auto delta = std::chrono::steady_clock::now() - ncclEpoch; double timestamp = std::chrono::duration_cast>(delta).count()*1000; len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line); + } else if (level == NCCL_LOG_ERROR) { + len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] [FATAL ERROR]: ", cudaDev); } len = std::min(len, sizeof(buffer)-1); // prevent overflows diff --git a/src/include/checks.h b/src/include/checks.h index cbb5a2de41..50c8f4c3ba 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -13,16 +13,14 @@ #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - return ncclUnhandledCudaError; \ + return rcclCudaErrorHandler(err); \ } \ } while(false) #define CUDACHECKGOTO(cmd, RES, label) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - RES = ncclUnhandledCudaError; \ + RES = rcclCudaErrorHandler(err); \ goto label; \ } \ } while(false) diff --git a/src/include/debug.h b/src/include/debug.h index 4e50cbf5a7..84a1dea26e 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -25,6 +25,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file extern thread_local int ncclDebugNoWarn; extern char ncclLastError[]; +#define ERROR(...) ncclDebugLog(NCCL_LOG_ERROR, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) 
ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) @@ -40,4 +41,22 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...); void ncclResetDebugInit(); +// RCCL custom error message handling. +static inline ncclResult_t rcclCudaErrorHandler(cudaError_t err) { + + // Print the cuda error + ERROR("HIP failure: '%s'", cudaGetErrorString(err)); + + // Special error message here: + switch (err) { + case cudaErrorStreamCaptureInvalidated: + ERROR("Application is trying to use an invalidated stream to launch RCCL kernel. " + "This operation is invalid. RCCL is exiting."); + break; + default: + break; + } + return ncclUnhandledCudaError; +} + #endif diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 7ddf367417..c2140289ed 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -12,11 +12,12 @@ typedef enum { NCCL_LOG_NONE = 0, - NCCL_LOG_VERSION = 1, - NCCL_LOG_WARN = 2, - NCCL_LOG_INFO = 3, - NCCL_LOG_ABORT = 4, - NCCL_LOG_TRACE = 5 + NCCL_LOG_ERROR = 1, + NCCL_LOG_VERSION = 2, + NCCL_LOG_WARN = 3, + NCCL_LOG_INFO = 4, + NCCL_LOG_ABORT = 5, + NCCL_LOG_TRACE = 6 } ncclDebugLogLevel; typedef enum { diff --git a/src/init.cc b/src/init.cc index 94bb566459..07ce94b219 100644 --- a/src/init.cc +++ b/src/init.cc @@ -149,7 +149,8 @@ ncclResult_t checkHsaEnvSetting() { INFO(NCCL_INIT, "Hipruntime version: %d, firmware version: %d", hipRuntimeVersion, firmwareVersion); if (!validHsaScratchEnvSetting(hsaScratchEnv, hipRuntimeVersion, firmwareVersion, devProp.gcnArchName)) { // Always print out this warning message - printf("Fatal Error: HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid performance degradation with HIP Runtime version:%d, GPU Firmware version:%d\n", hipRuntimeVersion, firmwareVersion); + ERROR("HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid performance degradation with the current HIP configuration. 
(Runtime version:%d, GPU Firmware version:%d)", hipRuntimeVersion, firmwareVersion); + ERROR("Please set HSA_NO_SCRATCH_RECLAIM=1 and rerun."); return ncclSystemError; } return ncclSuccess; @@ -2433,7 +2434,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); - if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { + if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel >= NCCL_LOG_VERSION && myrank == 0)) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, showVersion); } diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index db676635cc..39b7ebf856 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -259,7 +259,7 @@ ncclResult_t ncclStrongStreamRelease( CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); } if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { - WARN("%s", launchRaceFatalMsg); + ERROR("%s", launchRaceFatalMsg); return ncclInvalidUsage; } } else { @@ -321,7 +321,7 @@ ncclResult_t ncclStrongStreamRelease( #endif if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { - WARN("%s", launchRaceFatalMsg); + ERROR("%s", launchRaceFatalMsg); return ncclInvalidUsage; } } diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 44f474967a..1d3c7a0b25 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -139,7 +139,8 @@ static void ncclIbStatsFatalError(struct ncclIbStats* stat){ } static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) { if (ncclParamIbAsyncEvents() && __atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED)) { - WARN("communicator encountered a fatal error (detected in %s)\n", funcName); + ERROR("RCCL encountered a communication fatal error (detected in %s)\n", funcName); + ERROR("RCCL cannot recover from this 
network failure and is now exiting. Please check the network health."); return ncclSystemError; } return ncclSuccess;