From c441a87a00a7ad16c8619ad603feffbaff63fa81 Mon Sep 17 00:00:00 2001 From: Venkateshwar Reddy Kandula Date: Tue, 30 Sep 2025 11:42:42 -0500 Subject: [PATCH] [rocprofiler-sdk][RCCL] RCCL New API changes for RCCL_API_TRACE_VERSION_PATCH = 2 (#985) - Address build issue with RCCL sync with NCCL commit: ROCm/rccl@08a7be2 - Patch Version Bump-up PR: ROCm/rccl#1916 --- .../rocprofiler-sdk/cxx/enum_string.hpp | 7 + .../include/rocprofiler-sdk/rccl/api_args.h | 24 ++ .../include/rocprofiler-sdk/rccl/api_id.h | 5 + .../rocprofiler-sdk/rccl/details/api_trace.h | 20 +- .../rocprofiler-sdk/rccl/details/rccl.h | 270 ++++++++++++++---- .../source/lib/rocprofiler-sdk/rccl/abi.cpp | 6 + .../lib/rocprofiler-sdk/rccl/rccl.def.cpp | 5 + .../lib/rocprofiler-sdk/registration.cpp | 75 ++++- 8 files changed, 350 insertions(+), 62 deletions(-) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp index cb6d88cb42..7801ea8726 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp @@ -1140,11 +1140,18 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommDeregister); #if RCCL_API_TRACE_VERSION_PATCH >= 1 ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias); #endif +#if RCCL_API_TRACE_VERSION_PATCH >= 2 +ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommShrink); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister); +#endif #if RCCL_API_TRACE_VERSION_PATCH == 0 static_assert(ROCPROFILER_RCCL_API_ID_LAST == 37); #elif RCCL_API_TRACE_VERSION_PATCH == 1 static_assert(ROCPROFILER_RCCL_API_ID_LAST == 38); +#elif RCCL_API_TRACE_VERSION_PATCH == 2 +static_assert(ROCPROFILER_RCCL_API_ID_LAST == 41); #else # if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) 
&& \ (defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h index 769d3951d2..8203513b15 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h @@ -330,6 +330,30 @@ typedef union rocprofiler_rccl_api_args_t const void* acc; } ncclAllReduceWithBias; #endif +#if RCCL_API_TRACE_VERSION_PATCH >= 2 + struct + { + ncclComm_t comm; + int* excludeRanksList; + int excludeRanksCount; + ncclComm_t* newcomm; + ncclConfig_t* config; + int shrinkFlags; + } ncclCommShrink; + struct + { + ncclComm_t comm; + void* buff; + size_t size; + ncclWindow_t* win; + int winFlags; + } ncclCommWindowRegister; + struct + { + ncclComm_t comm; + ncclWindow_t win; + } ncclCommWindowDeregister; +#endif } rocprofiler_rccl_api_args_t; ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_id.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_id.h index 7cfb8a4a4c..afb3369a65 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_id.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_id.h @@ -70,6 +70,11 @@ typedef enum rocprofiler_rccl_api_id_t // NOLINT(performance-enum-size) ROCPROFILER_RCCL_API_ID_ncclCommDeregister, #if RCCL_API_TRACE_VERSION_PATCH >= 1 ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias, +#endif +#if RCCL_API_TRACE_VERSION_PATCH >= 2 + ROCPROFILER_RCCL_API_ID_ncclCommShrink, + ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister, + ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister, #endif ROCPROFILER_RCCL_API_ID_LAST, } rocprofiler_rccl_api_id_t; diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/api_trace.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/api_trace.h index 
1c8694b5ae..64a25f8429 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/api_trace.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/api_trace.h @@ -47,7 +47,7 @@ #define RCCL_API_TRACE_VERSION_MAJOR 0 // should be increased every time new members are added to existing dispatch tables -#define RCCL_API_TRACE_VERSION_PATCH 1 +#define RCCL_API_TRACE_VERSION_PATCH 2 #if !defined(RCCL_EXTERN_C_INIT) # ifdef __cplusplus @@ -182,6 +182,13 @@ typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm); typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm); +typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm, + int* excludeRanksList, + int excludeRanksCount, + ncclComm_t* newcomm, + ncclConfig_t* config, + int shrinkFlags); + typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key, @@ -232,6 +239,14 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle); +typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, + void* buff, + size_t size, + ncclWindow_t* win, + int winFlags); + +typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win); + typedef struct rcclApiFuncTable { uint64_t size; @@ -273,6 +288,9 @@ typedef struct rcclApiFuncTable ncclCommRegister_fn_t ncclCommRegister_fn; ncclCommDeregister_fn_t ncclCommDeregister_fn; ncclAllReduceWithBias_fn_t ncclAllReduceWithBias_fn; + ncclCommShrink_fn_t ncclCommShrink_fn; + ncclCommWindowRegister_fn_t ncclCommWindowRegister_fn; + ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn; } rcclApiFuncTable; RCCL_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h index e114a26d78..b67845fb1b 100644 --- 
a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h @@ -5,7 +5,6 @@ * * See LICENSE.txt for license information ************************************************************************/ - #ifndef NCCL_H_ #define NCCL_H_ @@ -13,18 +12,19 @@ #include #define NCCL_MAJOR 2 -#define NCCL_MINOR 20 -#define NCCL_PATCH 5 +#define NCCL_MINOR 27 +#define NCCL_PATCH 3 #define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE 22005 +#define NCCL_VERSION_CODE 22703 #define NCCL_VERSION(X, Y, Z) \ (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z)) -#define RCCL_BFLOAT16 1 -#define RCCL_FLOAT8 1 -#define RCCL_GATHER_SCATTER 1 -#define RCCL_ALLTOALLV 1 +#define RCCL_BFLOAT16 1 +#define RCCL_FLOAT8 1 +#define RCCL_GATHER_SCATTER 1 +#define RCCL_ALLTOALLV 1 +#define RCCL_ALLREDUCE_WITH_BIAS 1 #ifdef __cplusplus extern "C" { @@ -35,7 +35,8 @@ extern "C" { /*! @brief Opaque handle to communicator @details A communicator contains information required to facilitate collective communications calls */ -typedef const struct ncclComm* ncclComm_t; +typedef struct ncclComm* ncclComm_t; +typedef struct ncclWindow* ncclWindow_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 @@ -47,7 +48,7 @@ typedef struct } ncclUniqueId; /*! @defgroup rccl_result_code Result Codes - @brief The various result codes that RCCL API calls may return + @details The various result codes that RCCL API calls may return @{ */ /*! 
@brief Result type @@ -69,17 +70,29 @@ typedef enum #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL #define NCCL_SPLIT_NOCOLOR -1 +#define NCCL_UNDEF_FLOAT -1.0f + +/* Window Registration flags */ +#define NCCL_WIN_DEFAULT 0x00 +#define NCCL_WIN_COLL_SYMMETRIC 0x01 + +/* NCCL performance policy */ +#define NCCL_CTA_POLICY_DEFAULT 0x00 +#define NCCL_CTA_POLICY_EFFICIENCY 0x01 + +/* ncclCommShrink flags*/ +#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */ +#define NCCL_SHRINK_ABORT \ + 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */ /*! @defgroup rccl_config_type Communicator Configuration - @brief Structure that allows for customizing Communicator behavior via + @details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig @{ */ -/** - * @defgroup Communicator configuration - * @brief Users can assign value to attributes to specify the behavior of a communicator. - */ -typedef struct ncclConfig_v21700 +/*! @brief Communicator configuration + @details Users can assign value to attributes to specify the behavior of a communicator */ +typedef struct ncclConfig_v22700 { /* attributes that users should never touch. */ size_t size; /*!< Should not be touched */ @@ -92,6 +105,12 @@ typedef struct ncclConfig_v21700 int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */ const char* netName; /*!< Force NCCL to use a specfic network */ int splitShare; /*!< Allow communicators to share resources */ + int trafficClass; /*!< Traffic class*/ + const char* commName; /*!< Name of the communicator*/ + int collnetEnable; /*!< Check for collnet enablement*/ + int CTAPolicy; /*!< CTA Policy*/ + int shrinkShare; /*!< Shrink size*/ + int nvlsCTAs; /*!< Number of NVLS cooperative thread arrays (blocks)*/ } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. 
@@ -106,10 +125,35 @@ typedef struct ncclConfig_v21700 NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ + NCCL_CONFIG_UNDEF_PTR, /* commName */ \ + NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \ + NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \ + NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \ + NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \ } /*! @} */ +/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ +typedef struct ncclSimInfo_v22200 +{ + size_t size; + unsigned int magic; + unsigned int version; + float estimatedTime; +} ncclSimInfo_t; + +/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created. + * Not initialized simInfo will result in NCCL error. */ +#define NCCL_SIM_INFO_INITIALIZER \ + { \ + sizeof(ncclSimInfo_t), /* size */ \ + 0x74685283, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_UNDEF_FLOAT /* estimated time */ \ + } + /* NCCL malloc and free function for all types of NCCL optimizations * (e.g. user buffer registration). The actual allocated size might * be larger than requested due to granularity requirement. */ @@ -124,7 +168,7 @@ ncclResult_t pncclMemFree(void* ptr); /*! @defgroup rccl_api_version Version Information - @brief API call that returns RCCL version + @details API call that returns RCCL version @{ */ /*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer. @@ -142,7 +186,7 @@ pncclGetVersion(int* version); /*! @} */ /*! @defgroup rccl_api_communicator Communicator Initialization/Destruction - @brief API calls that operate on communicators. + @details API calls that operate on communicators. Communicators objects are used to launch collective communication operations. 
Unique ranks between 0 and N-1 must be assigned to each HIP device participating in the same Communicator. @@ -293,8 +337,69 @@ pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclCon /*! @endcond */ /*! @} */ +/*! @brief Shrink existing communicator. + @details Ranks in excludeRanksList will be removed from the existing communicator. + Within the new communicator, ranks will be re-ordered to fill the gap of removed + ones. If config is NULL, the new communicator will inherit the original communicator's + configuration. The flag enables NCCL to adapt to various states of the parent communicator, see + NCCL_SHRINK flags. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Original communicator object for this rank + @param[in] excludeRanksList List of ranks to be excluded + @param[in] excludeRanksCount Number of ranks to be excluded + @param[out] newcomm Pointer to new communicator + @param[in] config Config file for new communicator. May be NULL to inherit from + comm + @param[in] shrinkFlags Flag to adapt to various states of the parent communicator + (see NCCL_SHRINK flags)*/ +ncclResult_t +ncclCommShrink(ncclComm_t comm, + int* excludeRanksList, + int excludeRanksCount, + ncclComm_t* newcomm, + ncclConfig_t* config, + int shrinkFlags); +ncclResult_t +pncclCommShrink(ncclComm_t comm, + int* excludeRanksList, + int excludeRanksCount, + ncclComm_t* newcomm, + ncclConfig_t* config, + int shrinkFlags); + +/*! @brief Creates a new communicator (multi thread/process version), similar to + ncclCommInitRankConfig. + @details Allows using more than one ncclUniqueId (up to one per rank), + indicated by nId, to accelerate the init operation. + The number of ncclUniqueIds and their order must be the same for every rank. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[out] newcomm Pointer to new communicator + @param[in] nranks Total number of ranks participating in this communicator + @param[in] myrank Current rank + @param[in] nId Number of unique IDs + @param[in] commIds List of unique IDs + @param[in] config Config file for new communicator. May be NULL to inherit from comm */ +ncclResult_t +ncclCommInitRankScalable(ncclComm_t* newcomm, + int nranks, + int myrank, + int nId, + ncclUniqueId* commIds, + ncclConfig_t* config); +/*! @cond include_hidden */ +ncclResult_t +pncclCommInitRankScalable(ncclComm_t* newcomm, + int nranks, + int myrank, + int nId, + ncclUniqueId* commIds, + ncclConfig_t* config); +/*! @endcond */ + /*! @defgroup rccl_api_errcheck Error Checking Calls - @brief API calls that check for errors + @details API calls that check for errors @{ */ /*! @brief Returns a string for each result code. @@ -302,6 +407,7 @@ pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclCon @return String containing description of result code. @param[in] result Result code to get description for */ +/* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); /*! @cond include_hidden */ @@ -317,6 +423,14 @@ const char* pncclGetLastError(ncclComm_t comm); /*! @endcond */ +/* Reload environment variables that determine logging. */ +void +ncclResetDebugInit(); +/*! @cond include_hidden */ +void +pncclResetDebugInit(); +/*! @endcond */ + /*! @brief Checks whether the comm has encountered any asynchronous errors @details Query whether the provided communicator has encountered any asynchronous errors @return Result code. See @ref rccl_result_code for more details. @@ -332,7 +446,7 @@ pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); /*! @} */ /*! @defgroup rccl_api_comminfo Communicator Information - @brief API calls that query communicator information + @details API calls that query communicator information @{ */ /*! 
@brief Gets the number of ranks in the communicator clique. @@ -392,8 +506,24 @@ ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); /*! @endcond */ +/* Register memory window */ +ncclResult_t +ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); +/*! @cond include_hidden */ +ncclResult_t +pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); +/*! @endcond */ + +/* Deregister symmetric memory */ +ncclResult_t +ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win); +/*! @cond include_hidden */ +ncclResult_t +pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win); +/*! @endcond */ + /*! @defgroup rccl_api_enumerations API Enumerations - @brief Enumerations used by collective communication calls + @details Enumerations used by collective communication calls @{ */ /*! @brief Dummy reduction enumeration @@ -429,34 +559,29 @@ typedef enum @details Enumeration of the various supported datatype */ typedef enum { - ncclInt8 = 0, - ncclChar = 0, - ncclUint8 = 1, - ncclInt32 = 2, - ncclInt = 2, - ncclUint32 = 3, - ncclInt64 = 4, - ncclUint64 = 5, - ncclFloat16 = 6, - ncclHalf = 6, - ncclFloat32 = 7, - ncclFloat = 7, - ncclFloat64 = 8, - ncclDouble = 8, - ncclBfloat16 = 9, -#if defined(RCCL_FLOAT8) - ncclFp8E4M3 = 10, - ncclFp8E5M2 = 11, - ncclNumTypes = 12 + ncclInt8 = 0, + ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, + ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, + ncclHalf = 6, + ncclFloat32 = 7, + ncclFloat = 7, + ncclFloat64 = 8, + ncclDouble = 8, + ncclBfloat16 = 9, + ncclFloat8e4m3 = 10, + ncclFloat8e5m2 = 11, + ncclNumTypes = 12 } ncclDataType_t; -#else - ncclNumTypes = 10 -} ncclDataType_t; -#endif /*! @} */ /*! 
@defgroup rccl_api_custom_redop Custom Reduction Operator - @brief API calls relating to creation/destroying custom reduction operator + @details API calls relating to creation/destroying custom reduction operator that pre-multiplies local source arrays prior to reduction @{ */ @@ -518,7 +643,7 @@ pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); /*! @} */ /*! @defgroup rccl_collective_api Collective Communication Operations - @brief Collective communication operations must be called separately for each + @details Collective communication operations must be called separately for each communicator in a communicator clique. They return when operations have been enqueued on the HIP stream. @@ -662,6 +787,41 @@ pncclAllReduce(const void* sendbuff, hipStream_t stream); /*! @endcond */ +/*! @brief All-Reduce-with-Bias + @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and + leaves identical copies of result on each *recvbuff*. + In-place operation will happen if sendbuff == recvbuff. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Input data array to reduce + @param[out] recvbuff Data array to store reduced result array + @param[in] count Number of elements in data buffer + @param[in] datatype Data buffer element datatype + @param[in] op Reduction operator + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on + @param[in] acc Bias data array to reduce */ +ncclResult_t +ncclAllReduceWithBias(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + hipStream_t stream, + const void* acc); +/*! @cond include_hidden */ +ncclResult_t +pncclAllReduceWithBias(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + hipStream_t stream, + const void* acc); +/*! @endcond */ + /*! 
@brief Reduce-Scatter @details Reduces data in *sendbuff* using *op* operation and leaves reduced result scattered over the devices so that *recvbuff* on rank i will contain the i-th @@ -937,13 +1097,14 @@ pncclAllToAllv(const void* sendbuff, /*! @} */ /*! @defgroup msccl_api MSCCL Algorithm - @brief API calls relating to the optional MSCCL algorithm datapath + @details API calls relating to the optional MSCCL algorithm datapath @{ */ /*! @brief Opaque handle to MSCCL algorithm */ typedef int mscclAlgoHandle_t; /*! @brief MSCCL Load Algorithm + @deprecated This function has been removed from the public API. @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return its handle via mscclAlgoHandle. This API is expected to be called by MSCCL scheduler instead of end users. @@ -960,6 +1121,7 @@ pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle /*! @endcond */ /*! @brief MSCCL Run Algorithm + @deprecated This function has been removed from the public API. @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter list merges all possible parameters required by different operations as this is a general-purposed API. This API is expected to be called by MSCCL @@ -1032,7 +1194,7 @@ pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); /*! @} */ /*! @defgroup rccl_group_api Group semantics - @brief When managing multiple GPUs from a single thread, and since RCCL collective + @details When managing multiple GPUs from a single thread, and since RCCL collective calls may perform inter-CPU synchronization, we need to "group" calls for different ranks/devices into a single call. @@ -1076,6 +1238,16 @@ pncclGroupEnd(); /*! @endcond */ /*! @} */ +/* + * Group Simulate End + * + * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct. 
+ */ +ncclResult_t +ncclGroupSimulateEnd(ncclSimInfo_t* simInfo); +ncclResult_t +pncclGroupSimulateEnd(ncclSimInfo_t* simInfo); + #ifdef __cplusplus } // end extern "C" #endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp index a177c5281c..19732fd2cd 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp @@ -73,12 +73,20 @@ ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommRegister_fn, 35) ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommDeregister_fn, 36) #if RCCL_API_TRACE_VERSION_PATCH >= 1 ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclAllReduceWithBias_fn, 37) +#endif +/* Entries added in patch 2 need their own #if: an #elif chained to the patch >= 1 test above is never evaluated once that test is true, so these offset checks would be silently skipped. */ +#if RCCL_API_TRACE_VERSION_PATCH >= 2 +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommShrink_fn, 38) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommWindowRegister_fn, 39) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommWindowDeregister_fn, 40) #endif #if RCCL_API_TRACE_VERSION_PATCH == 0 ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 37) #elif RCCL_API_TRACE_VERSION_PATCH == 1 ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 38) +#elif RCCL_API_TRACE_VERSION_PATCH == 2 +ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 41) #else INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 0) #endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.def.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.def.cpp index 324ad8c0f9..1b9cdf8234 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.def.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.def.cpp @@ -105,6 +105,11 @@ RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_nc #if RCCL_API_TRACE_VERSION_PATCH >= 1 RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias, ncclAllReduceWithBias, 
ncclAllReduceWithBias_fn, sendbuff, recvbuff, count, datatype, op, comm, stream, acc) #endif +#if RCCL_API_TRACE_VERSION_PATCH >= 2 +RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommShrink, ncclCommShrink, ncclCommShrink_fn, comm, excludeRanksList, excludeRanksCount, newcomm, config, shrinkFlags) +RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister, ncclCommWindowRegister, ncclCommWindowRegister_fn, comm, buff, size, win, winFlags) +RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister, ncclCommWindowDeregister, ncclCommWindowDeregister_fn, comm, win) +#endif #else # error \ diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index 215c9fcd4b..092df0b9f1 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -1160,21 +1160,72 @@ rocprofiler_set_api_table(const char* name, auto* rccl_api = static_cast(tables[0]); - // any internal modifications to the rcclApiFuncTable need to be done before we make the - // copy or else those modifications will be lost when RCCL API tracing is enabled - // because the RCCL API tracing invokes the function pointers from the copy below - rocprofiler::rccl::copy_table(rccl_api, lib_instance); + auto is_valid_rccl_dispatch_table = (rccl_api != nullptr); - // install rocprofiler API wrappers - rocprofiler::rccl::update_table(rccl_api); + // Runtime ABI validation for RCCL API dispatch table. + // + // NOTE: These checks are necessary because rocprofiler-sdk enforces ABI + // compatibility at compile time. If RCCL is rebuilt afterwards with an + // incorrect or mismatched dispatch table, compile-time checks are bypassed. 
+#if ROCPROFILER_SDK_COMPUTE_VERSION(RCCL_API_TRACE_VERSION_MAJOR, \ + 0, \ + RCCL_API_TRACE_VERSION_PATCH) >= 1 + // 1. For RCCL_API_TRACE_VERSION_PATCH = 1, ncclAllReduceWithBias_fn is expected + // to be the last entry (38th function) in the dispatch table. Its offset is + // therefore used as the canonical end of the table for patch 1. + // + // Problem: Some intermediate RCCL commits introduced new APIs *before* + // ncclAllReduceWithBias_fn without bumping the ABI patch version. That + // breaks the ABI contract with rocprofiler-sdk, because the table layout no + // longer matches what the SDK was compiled against. + // + // 2. This check prevents such mismatches at runtime: + // a. NCCL_VERSION_CODE < 22703 → indicates the first RCCL build was taken + // before the broken commits. + // b. rccl_api->size > offsetof(..., ncclAllReduceWithBias_fn) + sizeof(void*) + // → indicates the current RCCL dispatch table is larger than expected, + // meaning newer (broken) entries were inserted before the known last API. + // + // If both conditions are true, the dispatch table is invalid and tracing is + // disabled to avoid corrupt output. + if(is_valid_rccl_dispatch_table && NCCL_VERSION_CODE < 22703 && + rccl_api->size > offsetof(rcclApiFuncTable, ncclAllReduceWithBias_fn) + sizeof(void*)) + { + is_valid_rccl_dispatch_table = false; - // Tracing notifications the runtime has initialized - rocprofiler::runtime_init::initialize( - ROCPROFILER_RUNTIME_INITIALIZATION_RCCL, lib_version, lib_instance); + ROCP_CI_LOG(WARNING) << fmt::format( + "Invalid RCCL dispatch table: layout does not match the expected " + "rocprofiler-SDK ABI (RCCL API Trace v{}.{}.{}). " + "Tracing is disabled to prevent corrupted data. 
" + "Use a compatible RCCL version.", + RCCL_API_TRACE_VERSION_MAJOR, + 0, + RCCL_API_TRACE_VERSION_PATCH); + } +#endif + if(is_valid_rccl_dispatch_table) + { + // any internal modifications to the rcclApiFuncTable need to be done before we make + // the copy or else those modifications will be lost when RCCL API tracing is + // enabled because the RCCL API tracing invokes the function pointers from the copy + // below + rocprofiler::rccl::copy_table(rccl_api, lib_instance); - // allow tools to install API wrappers - rocprofiler::intercept_table::notify_intercept_table_registration( - ROCPROFILER_RCCL_TABLE, lib_version, lib_instance, std::make_tuple(rccl_api)); + // install rocprofiler API wrappers + rocprofiler::rccl::update_table(rccl_api); + + // Tracing notifications the runtime has initialized + rocprofiler::runtime_init::initialize( + ROCPROFILER_RUNTIME_INITIALIZATION_RCCL, lib_version, lib_instance); + + // allow tools to install API wrappers + rocprofiler::intercept_table::notify_intercept_table_registration( + ROCPROFILER_RCCL_TABLE, lib_version, lib_instance, std::make_tuple(rccl_api)); + } + else + { + ROCP_CI_LOG(WARNING) << "RCCL API tracing is disabled: dispatch table is invalid."; + } } else if(std::string_view{name} == "rocdecode") {