[rocprofiler-sdk][RCCL] RCCL New API changes for RCCL_API_TRACE_VERSION_PATCH = 2 (#985)

- Address build issue with RCCL sync with NCCL commit: ROCm/rccl@08a7be2
- Patch Version Bump-up PR: ROCm/rccl#1916
Dieser Commit ist enthalten in:
Venkateshwar Reddy Kandula
2025-09-30 11:42:42 -05:00
committet von GitHub
Ursprung d1ee1f0cba
Commit c441a87a00
8 geänderte Dateien mit 350 neuen und 62 gelöschten Zeilen
@@ -1140,11 +1140,18 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommDeregister);
#if RCCL_API_TRACE_VERSION_PATCH >= 1
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias);
#endif
#if RCCL_API_TRACE_VERSION_PATCH >= 2
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommShrink);
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister);
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister);
#endif
#if RCCL_API_TRACE_VERSION_PATCH == 0
static_assert(ROCPROFILER_RCCL_API_ID_LAST == 37);
#elif RCCL_API_TRACE_VERSION_PATCH == 1
static_assert(ROCPROFILER_RCCL_API_ID_LAST == 38);
#elif RCCL_API_TRACE_VERSION_PATCH == 2
static_assert(ROCPROFILER_RCCL_API_ID_LAST == 41);
#else
# if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) && \
(defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0)
@@ -330,6 +330,30 @@ typedef union rocprofiler_rccl_api_args_t
const void* acc;
} ncclAllReduceWithBias;
#endif
#if RCCL_API_TRACE_VERSION_PATCH >= 2
struct
{
ncclComm_t comm;
int* excludeRanksList;
int excludeRanksCount;
ncclComm_t* newcomm;
ncclConfig_t* config;
int shrinkFlags;
} ncclCommShrink;
struct
{
ncclComm_t comm;
void* buff;
size_t size;
ncclWindow_t* win;
int winFlags;
} ncclCommWindowRegister;
struct
{
ncclComm_t comm;
ncclWindow_t win;
} ncclCommWindowDeregister;
#endif
} rocprofiler_rccl_api_args_t;
ROCPROFILER_EXTERN_C_FINI
@@ -70,6 +70,11 @@ typedef enum rocprofiler_rccl_api_id_t // NOLINT(performance-enum-size)
ROCPROFILER_RCCL_API_ID_ncclCommDeregister,
#if RCCL_API_TRACE_VERSION_PATCH >= 1
ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias,
#endif
#if RCCL_API_TRACE_VERSION_PATCH >= 2
ROCPROFILER_RCCL_API_ID_ncclCommShrink,
ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister,
ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister,
#endif
ROCPROFILER_RCCL_API_ID_LAST,
} rocprofiler_rccl_api_id_t;
@@ -47,7 +47,7 @@
#define RCCL_API_TRACE_VERSION_MAJOR 0
// should be increased every time new members are added to existing dispatch tables
#define RCCL_API_TRACE_VERSION_PATCH 1
#define RCCL_API_TRACE_VERSION_PATCH 2
#if !defined(RCCL_EXTERN_C_INIT)
# ifdef __cplusplus
@@ -182,6 +182,13 @@ typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);
typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);
typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm,
int* excludeRanksList,
int excludeRanksCount,
ncclComm_t* newcomm,
ncclConfig_t* config,
int shrinkFlags);
typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm,
int color,
int key,
@@ -232,6 +239,14 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm,
typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm,
void* buff,
size_t size,
ncclWindow_t* win,
int winFlags);
typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);
typedef struct rcclApiFuncTable
{
uint64_t size;
@@ -273,6 +288,9 @@ typedef struct rcclApiFuncTable
ncclCommRegister_fn_t ncclCommRegister_fn;
ncclCommDeregister_fn_t ncclCommDeregister_fn;
ncclAllReduceWithBias_fn_t ncclAllReduceWithBias_fn;
ncclCommShrink_fn_t ncclCommShrink_fn;
ncclCommWindowRegister_fn_t ncclCommWindowRegister_fn;
ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
} rcclApiFuncTable;
RCCL_EXTERN_C_FINI
@@ -5,7 +5,6 @@
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_H_
#define NCCL_H_
@@ -13,18 +12,19 @@
#include <hip/hip_runtime.h>
#define NCCL_MAJOR 2
#define NCCL_MINOR 20
#define NCCL_PATCH 5
#define NCCL_MINOR 27
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""
#define NCCL_VERSION_CODE 22005
#define NCCL_VERSION_CODE 22703
#define NCCL_VERSION(X, Y, Z) \
(((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z))
#define RCCL_BFLOAT16 1
#define RCCL_FLOAT8 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1
#define RCCL_BFLOAT16 1
#define RCCL_FLOAT8 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1
#define RCCL_ALLREDUCE_WITH_BIAS 1
#ifdef __cplusplus
extern "C" {
@@ -35,7 +35,8 @@ extern "C" {
/*! @brief Opaque handle to communicator
@details A communicator contains information required to facilitate collective communications
calls */
typedef const struct ncclComm* ncclComm_t;
typedef struct ncclComm* ncclComm_t;
typedef struct ncclWindow* ncclWindow_t;
#define NCCL_COMM_NULL NULL
#define NCCL_UNIQUE_ID_BYTES 128
@@ -47,7 +48,7 @@ typedef struct
} ncclUniqueId;
/*! @defgroup rccl_result_code Result Codes
@brief The various result codes that RCCL API calls may return
@details The various result codes that RCCL API calls may return
@{ */
/*! @brief Result type
@@ -69,17 +70,29 @@ typedef enum
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
#define NCCL_SPLIT_NOCOLOR -1
#define NCCL_UNDEF_FLOAT -1.0f
/* Window Registration flags */
#define NCCL_WIN_DEFAULT 0x00
#define NCCL_WIN_COLL_SYMMETRIC 0x01
/* NCCL performance policy */
#define NCCL_CTA_POLICY_DEFAULT 0x00
#define NCCL_CTA_POLICY_EFFICIENCY 0x01
/* ncclCommShrink flags*/
#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
#define NCCL_SHRINK_ABORT \
0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */
/*! @defgroup rccl_config_type Communicator Configuration
@brief Structure that allows for customizing Communicator behavior via
@details Structure that allows for customizing Communicator behavior via
ncclCommInitRankConfig
@{ */
/**
* @defgroup Communicator configuration
* @brief Users can assign value to attributes to specify the behavior of a communicator.
*/
typedef struct ncclConfig_v21700
/*! @brief Communicator configuration
@details Users can assign value to attributes to specify the behavior of a communicator */
typedef struct ncclConfig_v22700
{
/* attributes that users should never touch. */
size_t size; /*!< Should not be touched */
@@ -92,6 +105,12 @@ typedef struct ncclConfig_v21700
int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */
const char* netName; /*!< Force NCCL to use a specfic network */
int splitShare; /*!< Allow communicators to share resources */
int trafficClass; /*!< Traffic class*/
const char* commName; /*!< Name of the communicator*/
int collnetEnable; /*!< Check for collnet enablement*/
int CTAPolicy; /*!< CTA Policy*/
int shrinkShare; /*!< Shrink size*/
int nvlsCTAs; /*!< Number of NVLS cooperative thread arrays (blocks)*/
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
@@ -106,10 +125,35 @@ typedef struct ncclConfig_v21700
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
NCCL_CONFIG_UNDEF_PTR, /* commName */ \
NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \
NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \
NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \
NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \
}
/*! @} */
/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */
typedef struct ncclSimInfo_v22200
{
size_t size;
unsigned int magic;
unsigned int version;
float estimatedTime;
} ncclSimInfo_t;
/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created.
* Not initialized simInfo will result in NCCL error. */
#define NCCL_SIM_INFO_INITIALIZER \
{ \
sizeof(ncclSimInfo_t), /* size */ \
0x74685283, /* magic */ \
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
NCCL_UNDEF_FLOAT /* estimated time */ \
}
/* NCCL malloc and free function for all types of NCCL optimizations
* (e.g. user buffer registration). The actual allocated size might
* be larger than requested due to granularity requirement. */
@@ -124,7 +168,7 @@ ncclResult_t
pncclMemFree(void* ptr);
/*! @defgroup rccl_api_version Version Information
@brief API call that returns RCCL version
@details API call that returns RCCL version
@{ */
/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
@@ -142,7 +186,7 @@ pncclGetVersion(int* version);
/*! @} */
/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
@brief API calls that operate on communicators.
@details API calls that operate on communicators.
Communicators objects are used to launch collective communication
operations. Unique ranks between 0 and N-1 must be assigned to
each HIP device participating in the same Communicator.
@@ -293,8 +337,69 @@ pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclCon
/*! @endcond */
/*! @} */
/*! @brief Shrink existing communicator.
@details Ranks in excludeRanksList will be removed form the existing communicator.
Within the new communicator, ranks will be re-ordered to fill the gap of removed
ones. If config is NULL, the new communicator will inherit the original communicator's
configuration. The flag enables NCCL to adapt to various states of the parent communicator, see
NCCL_SHRINK flags.
@return Result code. See @ref rccl_result_code for more details.
@param[in] comm Original communicator object for this rank
@param[in] excludeRanksList List of ranks to be exluded
@param[in] excludeRanksCount Number of ranks to be excluded
@param[out] newcomm Pointer to new communicator
@param[in] config Config file for new communicator. May be NULL to inherit from
comm
@param[in] shrinkFlags Flag to adapt to various states of the parent communicator
(see NCCL_SHRINK flags)*/
ncclResult_t
ncclCommShrink(ncclComm_t comm,
int* excludeRanksList,
int excludeRanksCount,
ncclComm_t* newcomm,
ncclConfig_t* config,
int shrinkFlags);
ncclResult_t
pncclCommShrink(ncclComm_t comm,
int* excludeRanksList,
int excludeRanksCount,
ncclComm_t* newcomm,
ncclConfig_t* config,
int shrinkFlags);
/*! @brief Creates a new communicator (multi thread/process version), similar to
ncclCommInitRankConfig.
@details Allows to use more than one ncclUniqueId (up to one per rank),
indicated by nId, to accelerate the init operation.
The number of ncclUniqueIds and their order must be the same for every rank.
@return Result code. See @ref rccl_result_code for more details.
@param[out] newcomm Pointer to new communicator
@param[in] nranks Total number of ranks participating in this communicator
@param[in] myrank Current rank
@param[in] nId Number of unique IDs
@param[in] commIds List of unique IDs
@param[in] config Config file for new communicator. May be NULL to inherit from comm */
ncclResult_t
ncclCommInitRankScalable(ncclComm_t* newcomm,
int nranks,
int myrank,
int nId,
ncclUniqueId* commIds,
ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t
pncclCommInitRankScalable(ncclComm_t* newcomm,
int nranks,
int myrank,
int nId,
ncclUniqueId* commIds,
ncclConfig_t* config);
/*! @endcond */
/*! @defgroup rccl_api_errcheck Error Checking Calls
@brief API calls that check for errors
@details API calls that check for errors
@{ */
/*! @brief Returns a string for each result code.
@@ -302,6 +407,7 @@ pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclCon
@return String containing description of result code.
@param[in] result Result code to get description for */
/* Returns a string for each error code. */
const char*
ncclGetErrorString(ncclResult_t result);
/*! @cond include_hidden */
@@ -317,6 +423,14 @@ const char*
pncclGetLastError(ncclComm_t comm);
/*! @endcond */
/* Reload environment variables that determine logging. */
void
ncclResetDebugInit();
/*! @cond include_hidden */
void
pncclResetDebugInit();
/*! @endcond */
/*! @brief Checks whether the comm has encountered any asynchronous errors
@details Query whether the provided communicator has encountered any asynchronous errors
@return Result code. See @ref rccl_result_code for more details.
@@ -332,7 +446,7 @@ pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/*! @} */
/*! @defgroup rccl_api_comminfo Communicator Information
@brief API calls that query communicator information
@details API calls that query communicator information
@{ */
/*! @brief Gets the number of ranks in the communicator clique.
@@ -392,8 +506,24 @@ ncclResult_t
pncclCommDeregister(const ncclComm_t comm, void* handle);
/*! @endcond */
/* Register memory window */
ncclResult_t
ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
/*! @cond include_hidden */
ncclResult_t
pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
/*! @endcond */
/* Deregister symmetric memory */
ncclResult_t
ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
/*! @cond include_hidden */
ncclResult_t
pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
/*! @endcond */
/*! @defgroup rccl_api_enumerations API Enumerations
@brief Enumerations used by collective communication calls
@details Enumerations used by collective communication calls
@{ */
/*! @brief Dummy reduction enumeration
@@ -429,34 +559,29 @@ typedef enum
@details Enumeration of the various supported datatype */
typedef enum
{
ncclInt8 = 0,
ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2,
ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6,
ncclHalf = 6,
ncclFloat32 = 7,
ncclFloat = 7,
ncclFloat64 = 8,
ncclDouble = 8,
ncclBfloat16 = 9,
#if defined(RCCL_FLOAT8)
ncclFp8E4M3 = 10,
ncclFp8E5M2 = 11,
ncclNumTypes = 12
ncclInt8 = 0,
ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2,
ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6,
ncclHalf = 6,
ncclFloat32 = 7,
ncclFloat = 7,
ncclFloat64 = 8,
ncclDouble = 8,
ncclBfloat16 = 9,
ncclFloat8e4m3 = 10,
ncclFloat8e5m2 = 11,
ncclNumTypes = 12
} ncclDataType_t;
#else
ncclNumTypes = 10
} ncclDataType_t;
#endif
/*! @} */
/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
@brief API calls relating to creation/destroying custom reduction operator
@details API calls relating to creation/destroying custom reduction operator
that pre-multiplies local source arrays prior to reduction
@{ */
@@ -518,7 +643,7 @@ pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*! @} */
/*! @defgroup rccl_collective_api Collective Communication Operations
@brief Collective communication operations must be called separately for each
@details Collective communication operations must be called separately for each
communicator in a communicator clique.
They return when operations have been enqueued on the HIP stream.
@@ -662,6 +787,41 @@ pncclAllReduce(const void* sendbuff,
hipStream_t stream);
/*! @endcond */
/*! @brief All-Reduce-with-Bias
@details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
leaves identical copies of result on each *recvbuff*.
In-place operation will happen if sendbuff == recvbuff.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Input data array to reduce
@param[out] recvbuff Data array to store reduced result array
@param[in] count Number of elements in data buffer
@param[in] datatype Data buffer element datatype
@param[in] op Reduction operator
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on
@param[in] acc Bias data array to reduce */
ncclResult_t
ncclAllReduceWithBias(const void* sendbuff,
void* recvbuff,
size_t count,
ncclDataType_t datatype,
ncclRedOp_t op,
ncclComm_t comm,
hipStream_t stream,
const void* acc);
/*! @cond include_hidden */
ncclResult_t
pncclAllReduceWithBias(const void* sendbuff,
void* recvbuff,
size_t count,
ncclDataType_t datatype,
ncclRedOp_t op,
ncclComm_t comm,
hipStream_t stream,
const void* acc);
/*! @endcond */
/*! @brief Reduce-Scatter
@details Reduces data in *sendbuff* using *op* operation and leaves reduced result
scattered over the devices so that *recvbuff* on rank i will contain the i-th
@@ -937,13 +1097,14 @@ pncclAllToAllv(const void* sendbuff,
/*! @} */
/*! @defgroup msccl_api MSCCL Algorithm
@brief API calls relating to the optional MSCCL algorithm datapath
@details API calls relating to the optional MSCCL algorithm datapath
@{ */
/*! @brief Opaque handle to MSCCL algorithm */
typedef int mscclAlgoHandle_t;
/*! @brief MSCCL Load Algorithm
@deprecated This function has been removed from the public API.
@details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
scheduler instead of end users.
@@ -960,6 +1121,7 @@ pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle
/*! @endcond */
/*! @brief MSCCL Run Algorithm
@deprecated This function has been removed from the public API.
@details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
list merges all possible parameters required by different operations as this
is a general-purposed API. This API is expected to be called by MSCCL
@@ -1032,7 +1194,7 @@ pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
/*! @} */
/*! @defgroup rccl_group_api Group semantics
@brief When managing multiple GPUs from a single thread, and since RCCL collective
@details When managing multiple GPUs from a single thread, and since RCCL collective
calls may perform inter-CPU synchronization, we need to "group" calls for
different ranks/devices into a single call.
@@ -1076,6 +1238,16 @@ pncclGroupEnd();
/*! @endcond */
/*! @} */
/*
* Group Simulate End
*
* Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
*/
ncclResult_t
ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
ncclResult_t
pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
#ifdef __cplusplus
} // end extern "C"
#endif
@@ -73,12 +73,18 @@ ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommRegister_fn, 35)
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommDeregister_fn, 36)
#if RCCL_API_TRACE_VERSION_PATCH >= 1
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclAllReduceWithBias_fn, 37)
#elif RCCL_API_TRACE_VERSION_PATCH >= 2
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommShrink_fn, 38)
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommWindowRegister_fn, 39)
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommWindowDeregister_fn, 40)
#endif
#if RCCL_API_TRACE_VERSION_PATCH == 0
ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 37)
#elif RCCL_API_TRACE_VERSION_PATCH == 1
ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 38)
#elif RCCL_API_TRACE_VERSION_PATCH == 2
ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 41)
#else
INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 0)
#endif
@@ -105,6 +105,11 @@ RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_nc
#if RCCL_API_TRACE_VERSION_PATCH >= 1
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias, ncclAllReduceWithBias, ncclAllReduceWithBias_fn, sendbuff, recvbuff, count, datatype, op, comm, stream, acc)
#endif
#if RCCL_API_TRACE_VERSION_PATCH >= 2
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommShrink, ncclCommShrink, ncclCommShrink_fn, comm, excludeRanksList, excludeRanksCount, newcomm, config, shrinkFlags)
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister, ncclCommWindowRegister, ncclCommWindowRegister_fn, comm, buff, size, win, winFlags)
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister, ncclCommWindowDeregister, ncclCommWindowDeregister_fn, comm, win)
#endif
#else
# error \
@@ -1160,21 +1160,72 @@ rocprofiler_set_api_table(const char* name,
auto* rccl_api = static_cast<rcclApiFuncTable*>(tables[0]);
// any internal modifications to the rcclApiFuncTable need to be done before we make the
// copy or else those modifications will be lost when RCCL API tracing is enabled
// because the RCCL API tracing invokes the function pointers from the copy below
rocprofiler::rccl::copy_table(rccl_api, lib_instance);
auto is_valid_rccl_dispatch_table = (rccl_api != nullptr);
// install rocprofiler API wrappers
rocprofiler::rccl::update_table(rccl_api);
// Runtime ABI validation for RCCL API dispatch table.
//
// NOTE: These checks are necessary because rocprofiler-sdk enforces ABI
// compatibility at compile time. If RCCL is rebuilt afterwards with an
// incorrect or mismatched dispatch table, compile-time checks are bypassed.
#if ROCPROFILER_SDK_COMPUTE_VERSION(RCCL_API_TRACE_VERSION_MAJOR, \
0, \
RCCL_API_TRACE_VERSION_PATCH) >= 1
// 1. For RCCL_API_TRACE_VERSION_PATCH = 1, ncclAllReduceWithBias_fn is expected
// to be the last entry (38th function) in the dispatch table. Its offset is
// therefore used as the canonical end of the table for patch 1.
//
// Problem: Some intermediate RCCL commits introduced new APIs *before*
// ncclAllReduceWithBias_fn without bumping the ABI patch version. That
// breaks the ABI contract with rocprofiler-sdk, because the table layout no
// longer matches what the SDK was compiled against.
//
// 2. This check prevents such mismatches at runtime:
// a. NCCL_VERSION_CODE < 22703 → indicates the first RCCL build was taken
// before the broken commits.
// b. rccl_api->size > offsetof(..., ncclAllReduceWithBias_fn) + sizeof(void*)
// → indicates the current RCCL dispatch table is larger than expected,
// meaning newer (broken) entries were inserted before the known last API.
//
// If both conditions are true, the dispatch table is invalid and tracing is
// disabled to avoid corrupt output.
if(is_valid_rccl_dispatch_table && NCCL_VERSION_CODE < 22703 &&
rccl_api->size > offsetof(rcclApiFuncTable, ncclAllReduceWithBias_fn) + sizeof(void*))
{
is_valid_rccl_dispatch_table = false;
// Tracing notifications the runtime has initialized
rocprofiler::runtime_init::initialize(
ROCPROFILER_RUNTIME_INITIALIZATION_RCCL, lib_version, lib_instance);
ROCP_CI_LOG(WARNING) << fmt::format(
"Invalid RCCL dispatch table: layout does not match the expected "
"rocprofiler-SDK ABI (RCCL API Trace v{}.{}.{}). "
"Tracing is disabled to prevent corrupted data. "
"Use a compatible RCCL version.",
RCCL_API_TRACE_VERSION_MAJOR,
0,
RCCL_API_TRACE_VERSION_PATCH);
}
#endif
if(is_valid_rccl_dispatch_table)
{
// any internal modifications to the rcclApiFuncTable need to be done before we make
// the copy or else those modifications will be lost when RCCL API tracing is
// enabled because the RCCL API tracing invokes the function pointers from the copy
// below
rocprofiler::rccl::copy_table(rccl_api, lib_instance);
// allow tools to install API wrappers
rocprofiler::intercept_table::notify_intercept_table_registration(
ROCPROFILER_RCCL_TABLE, lib_version, lib_instance, std::make_tuple(rccl_api));
// install rocprofiler API wrappers
rocprofiler::rccl::update_table(rccl_api);
// Tracing notifications the runtime has initialized
rocprofiler::runtime_init::initialize(
ROCPROFILER_RUNTIME_INITIALIZATION_RCCL, lib_version, lib_instance);
// allow tools to install API wrappers
rocprofiler::intercept_table::notify_intercept_table_registration(
ROCPROFILER_RCCL_TABLE, lib_version, lib_instance, std::make_tuple(rccl_api));
}
else
{
ROCP_CI_LOG(WARNING) << "RCCL API tracing is disabled: dispatch table is invalid.";
}
}
else if(std::string_view{name} == "rocdecode")
{