From 1ce83d5cc07b26e748128c33d2566b3d959c6ae9 Mon Sep 17 00:00:00 2001 From: Arm Patinyasakdikul Date: Thu, 30 Oct 2025 14:14:20 -0700 Subject: [PATCH] Added ERROR message class to handle fatal error messages. (#2002) * Added ERROR message class to handle fatal error messages. The new ERROR message class prints its message at every debug level, including none. Changed some of the fatal error messages to use ERROR instead of WARN. Added a new error handler function so that more meaningful error messages can be printed in the future. * Added CHANGELOG entry. * Update CHANGELOG.md Co-authored-by: Jeffrey Novotny * Change to no longer reuse NONE as ERROR. ERROR is now a separate class. * Update CHANGELOG.md Co-authored-by: Jeffrey Novotny --------- Co-authored-by: Jeffrey Novotny --- CHANGELOG.md | 4 ++++ ext-net/example/nccl/common.h | 2 +- ext-profiler/example/nccl/common.h | 2 +- ext-tuner/basic/nccl/common.h | 2 +- ext-tuner/example/nccl/common.h | 2 +- ext-tuner/example/test/test_plugin.c | 3 ++- ext-tuner/model_demo/nccl/common.h | 2 +- src/debug.cc | 4 ++++ src/include/checks.h | 6 ++---- src/include/debug.h | 19 +++++++++++++++++++ src/include/nccl_common.h | 11 ++++++----- src/init.cc | 5 +++-- src/misc/strongstream.cc | 4 ++-- src/transport/net_ib.cc | 3 ++- 14 files changed, 49 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9234702d52..7b7719971e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ## Unreleased - RCCL 2.27.7 for ROCm 7.2.0 +### Changed + +* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`. 
+ ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1 ### Changed diff --git a/ext-net/example/nccl/common.h b/ext-net/example/nccl/common.h index 5aec2f7bba..d5da339021 100644 --- a/ext-net/example/nccl/common.h +++ b/ext-net/example/nccl/common.h @@ -9,7 +9,7 @@ #include -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-profiler/example/nccl/common.h b/ext-profiler/example/nccl/common.h index 912925225c..6afc97b4e7 100644 --- a/ext-profiler/example/nccl/common.h +++ b/ext-profiler/example/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-tuner/basic/nccl/common.h b/ext-tuner/basic/nccl/common.h index 912925225c..6afc97b4e7 
100644 --- a/ext-tuner/basic/nccl/common.h +++ b/ext-tuner/basic/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-tuner/example/nccl/common.h b/ext-tuner/example/nccl/common.h index 912925225c..6afc97b4e7 100644 --- a/ext-tuner/example/nccl/common.h +++ b/ext-tuner/example/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c index 28897c4490..050d83ff0e 100644 --- a/ext-tuner/example/test/test_plugin.c +++ b/ext-tuner/example/test/test_plugin.c @@ -62,7 +62,8 @@ void 
mock_logger(ncclDebugLogLevel level, unsigned long flags, // Convert log level to string const char* level_str; switch(level) { - case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_ERROR: level_str = "ERROR"; break; case NCCL_LOG_VERSION: level_str = "VERSION"; break; case NCCL_LOG_WARN: level_str = "WARN"; break; case NCCL_LOG_INFO: level_str = "INFO"; break; diff --git a/ext-tuner/model_demo/nccl/common.h b/ext-tuner/model_demo/nccl/common.h index 912925225c..6afc97b4e7 100644 --- a/ext-tuner/model_demo/nccl/common.h +++ b/ext-tuner/model_demo/nccl/common.h @@ -7,7 +7,7 @@ #ifndef COMMON_H_ #define COMMON_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/src/debug.cc b/src/debug.cc index e5a0ed9c26..2bdca5735d 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -47,6 +47,8 @@ static void ncclDebugInit() { ncclDebugFile = stdout; } if (nccl_debug == NULL) { + tempNcclDebugLevel = NCCL_LOG_ERROR; + } else if (strcasecmp(nccl_debug, "NONE") == 0) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { tempNcclDebugLevel = NCCL_LOG_VERSION; @@ -372,6 +374,8 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file auto delta = std::chrono::steady_clock::now() - ncclEpoch; double timestamp = 
std::chrono::duration_cast>(delta).count()*1000; len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line); + } else if (level == NCCL_LOG_ERROR) { + len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] [FATAL ERROR]: ", cudaDev); } len = std::min(len, sizeof(buffer)-1); // prevent overflows diff --git a/src/include/checks.h b/src/include/checks.h index cbb5a2de41..50c8f4c3ba 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -13,16 +13,14 @@ #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - return ncclUnhandledCudaError; \ + return rcclCudaErrorHandler(err); \ } \ } while(false) #define CUDACHECKGOTO(cmd, RES, label) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - RES = ncclUnhandledCudaError; \ + RES = rcclCudaErrorHandler(err); \ goto label; \ } \ } while(false) diff --git a/src/include/debug.h b/src/include/debug.h index 4e50cbf5a7..84a1dea26e 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -25,6 +25,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file extern thread_local int ncclDebugNoWarn; extern char ncclLastError[]; +#define ERROR(...) ncclDebugLog(NCCL_LOG_ERROR, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) @@ -40,4 +41,22 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...); void ncclResetDebugInit(); +// RCCL custom error message handling. 
+static inline ncclResult_t rcclCudaErrorHandler(cudaError_t err) { + + // Print the cuda error + ERROR("HIP failure: '%s'", cudaGetErrorString(err)); + + // Special error message here: + switch (err) { + case cudaErrorStreamCaptureInvalidated: + ERROR("Application is trying to use an invalidated stream to launch RCCL kernel. " + "This operation is invalid. RCCL is exiting."); + break; + default: + break; + } + return ncclUnhandledCudaError; +} + #endif diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 7ddf367417..c2140289ed 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -12,11 +12,12 @@ typedef enum { NCCL_LOG_NONE = 0, - NCCL_LOG_VERSION = 1, - NCCL_LOG_WARN = 2, - NCCL_LOG_INFO = 3, - NCCL_LOG_ABORT = 4, - NCCL_LOG_TRACE = 5 + NCCL_LOG_ERROR = 1, + NCCL_LOG_VERSION = 2, + NCCL_LOG_WARN = 3, + NCCL_LOG_INFO = 4, + NCCL_LOG_ABORT = 5, + NCCL_LOG_TRACE = 6 } ncclDebugLogLevel; typedef enum { diff --git a/src/init.cc b/src/init.cc index 94bb566459..07ce94b219 100644 --- a/src/init.cc +++ b/src/init.cc @@ -149,7 +149,8 @@ ncclResult_t checkHsaEnvSetting() { INFO(NCCL_INIT, "Hipruntime version: %d, firmware version: %d", hipRuntimeVersion, firmwareVersion); if (!validHsaScratchEnvSetting(hsaScratchEnv, hipRuntimeVersion, firmwareVersion, devProp.gcnArchName)) { // Always print out this warning message - printf("Fatal Error: HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid performance degradation with HIP Runtime version:%d, GPU Firmware version:%d\n", hipRuntimeVersion, firmwareVersion); + ERROR("HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid performance degradation with the current HIP configuration. 
(Runtime version:%d, GPU Firmware version:%d)", hipRuntimeVersion, firmwareVersion); + ERROR("Please set HSA_NO_SCRATCH_RECLAIM=1 and rerun."); return ncclSystemError; } return ncclSuccess; @@ -2433,7 +2434,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); - if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { + if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel >= NCCL_LOG_VERSION && myrank == 0)) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, showVersion); } diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index db676635cc..39b7ebf856 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -259,7 +259,7 @@ ncclResult_t ncclStrongStreamRelease( CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); } if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { - WARN("%s", launchRaceFatalMsg); + ERROR("%s", launchRaceFatalMsg); return ncclInvalidUsage; } } else { @@ -321,7 +321,7 @@ ncclResult_t ncclStrongStreamRelease( #endif if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { - WARN("%s", launchRaceFatalMsg); + ERROR("%s", launchRaceFatalMsg); return ncclInvalidUsage; } } diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 44f474967a..1d3c7a0b25 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -139,7 +139,8 @@ static void ncclIbStatsFatalError(struct ncclIbStats* stat){ } static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) { if (ncclParamIbAsyncEvents() && __atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED)) { - WARN("communicator encountered a fatal error (detected in %s)\n", funcName); + ERROR("RCCL encountered a communication fatal error (detected in %s)\n", funcName); + ERROR("RCCL cannot recover from this 
network failure and now exiting. Please check the network health."); return ncclSystemError; } return ncclSuccess;