Added ERROR message class to handle fatal error messages. (#2002)

* Added ERROR message class to handle fatal error messages.

The new ERROR message class prints its messages at all debug levels,
including none.

Changed some of the fatal error messages to use ERROR instead of WARN.

Added a new error handler function to print out more meaningful error
messages in the future.

* Added CHANGELOG entry.

* Update CHANGELOG.md

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>

* Changed to no longer reuse NONE as ERROR. ERROR is now a separate class.

* Update CHANGELOG.md

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>

---------

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>

[ROCm/rccl commit: 1ce83d5cc0]
This commit is contained in:
Arm Patinyasakdikul
2025-10-30 14:14:20 -07:00
کامیت شده توسط GitHub
والد 03e92dc942
کامیت 54194a17c3
14فایلهای تغییر یافته به همراه49 افزوده شده و 20 حذف شده
@@ -4,6 +4,10 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
## Unreleased - RCCL 2.27.7 for ROCm 7.2.0
### Changed
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
### Changed
@@ -9,7 +9,7 @@
#include <stdint.h>
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -7,7 +7,7 @@
#ifndef COMMON_H_
#define COMMON_H_
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -7,7 +7,7 @@
#ifndef COMMON_H_
#define COMMON_H_
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -7,7 +7,7 @@
#ifndef COMMON_H_
#define COMMON_H_
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -62,7 +62,8 @@ void mock_logger(ncclDebugLogLevel level, unsigned long flags,
// Convert log level to string
const char* level_str;
switch(level) {
case NCCL_LOG_NONE: level_str = "NONE"; break;
case NCCL_LOG_NONE: level_str = "NONE"; break;
case NCCL_LOG_ERROR: level_str = "ERROR"; break;
case NCCL_LOG_VERSION: level_str = "VERSION"; break;
case NCCL_LOG_WARN: level_str = "WARN"; break;
case NCCL_LOG_INFO: level_str = "INFO"; break;
@@ -7,7 +7,7 @@
#ifndef COMMON_H_
#define COMMON_H_
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -47,6 +47,8 @@ static void ncclDebugInit() {
ncclDebugFile = stdout;
}
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_ERROR;
} else if (strcasecmp(nccl_debug, "NONE") == 0) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
@@ -372,6 +374,8 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line);
} else if (level == NCCL_LOG_ERROR) {
len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] [FATAL ERROR]: ", cudaDev);
}
len = std::min(len, sizeof(buffer)-1); // prevent overflows
@@ -13,16 +13,14 @@
#define CUDACHECK(cmd) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
return ncclUnhandledCudaError; \
return rcclCudaErrorHandler(err); \
} \
} while(false)
#define CUDACHECKGOTO(cmd, RES, label) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
RES = ncclUnhandledCudaError; \
RES = rcclCudaErrorHandler(err); \
goto label; \
} \
} while(false)
@@ -25,6 +25,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
extern thread_local int ncclDebugNoWarn;
extern char ncclLastError[];
#define ERROR(...) ncclDebugLog(NCCL_LOG_ERROR, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
@@ -40,4 +41,22 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
void ncclResetDebugInit();
/**
 * RCCL custom error message handling for a failed runtime call.
 *
 * Logs the runtime's description of `err` through ERROR(), then, for error
 * codes that have a dedicated explanation, logs that extra message as well.
 *
 * @param err  Error code reported by the failing runtime call.
 * @return     Always ncclUnhandledCudaError.
 */
static inline ncclResult_t rcclCudaErrorHandler(cudaError_t err) {
  // Generic report: the runtime's own text for this error code.
  ERROR("HIP failure: '%s'", cudaGetErrorString(err));

  // Error codes that get an additional, more descriptive message.
  if (err == cudaErrorStreamCaptureInvalidated) {
    ERROR("Application is trying to use an invalidated stream to launch RCCL kernel. "
          "This operation is invalid. RCCL is exiting.");
  }

  return ncclUnhandledCudaError;
}
#endif
@@ -12,11 +12,12 @@
typedef enum {
NCCL_LOG_NONE = 0,
NCCL_LOG_VERSION = 1,
NCCL_LOG_WARN = 2,
NCCL_LOG_INFO = 3,
NCCL_LOG_ABORT = 4,
NCCL_LOG_TRACE = 5
NCCL_LOG_ERROR = 1,
NCCL_LOG_VERSION = 2,
NCCL_LOG_WARN = 3,
NCCL_LOG_INFO = 4,
NCCL_LOG_ABORT = 5,
NCCL_LOG_TRACE = 6
} ncclDebugLogLevel;
typedef enum {
+3 -2
مشاهده پرونده
@@ -149,7 +149,8 @@ ncclResult_t checkHsaEnvSetting() {
INFO(NCCL_INIT, "Hipruntime version: %d, firmware version: %d", hipRuntimeVersion, firmwareVersion);
if (!validHsaScratchEnvSetting(hsaScratchEnv, hipRuntimeVersion, firmwareVersion, devProp.gcnArchName)) {
// Always print out this warning message
printf("Fatal Error: HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid performance degradation with HIP Runtime version:%d, GPU Firmware version:%d\n", hipRuntimeVersion, firmwareVersion);
ERROR("HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid performance degradation with the current HIP configuration. (Runtime version:%d, GPU Firmware version:%d)", hipRuntimeVersion, firmwareVersion);
ERROR("Please set HSA_NO_SCRATCH_RECLAIM=1 and rerun.");
return ncclSystemError;
}
return ncclSuccess;
@@ -2433,7 +2434,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
// first call ncclInit, this will setup the environment
NCCLCHECKGOTO(ncclInit(), res, fail);
if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) {
if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel >= NCCL_LOG_VERSION && myrank == 0)) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, showVersion);
}
@@ -259,7 +259,7 @@ ncclResult_t ncclStrongStreamRelease(
CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream));
}
if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
WARN("%s", launchRaceFatalMsg);
ERROR("%s", launchRaceFatalMsg);
return ncclInvalidUsage;
}
} else {
@@ -321,7 +321,7 @@ ncclResult_t ncclStrongStreamRelease(
#endif
if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
WARN("%s", launchRaceFatalMsg);
ERROR("%s", launchRaceFatalMsg);
return ncclInvalidUsage;
}
}
@@ -139,7 +139,8 @@ static void ncclIbStatsFatalError(struct ncclIbStats* stat){
}
static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) {
if (ncclParamIbAsyncEvents() && __atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED)) {
WARN("communicator encountered a fatal error (detected in %s)\n", funcName);
ERROR("RCCL encountered a communication fatal error (detected in %s)\n", funcName);
ERROR("RCCL cannot recover from this network failure and now exiting. Please check the network health.");
return ncclSystemError;
}
return ncclSuccess;