[rocprofiler-sdk][RCCL] RCCL New API changes for RCCL_API_TRACE_VERSION_PATCH = 2 (#985)
- Address build issue with RCCL sync with NCCL commit: ROCm/rccl@08a7be2 - Patch Version Bump-up PR: ROCm/rccl#1916
Dieser Commit ist enthalten in:
committet von
GitHub
Ursprung
d1ee1f0cba
Commit
c441a87a00
@@ -1140,11 +1140,18 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommDeregister);
|
||||
#if RCCL_API_TRACE_VERSION_PATCH >= 1
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias);
|
||||
#endif
|
||||
#if RCCL_API_TRACE_VERSION_PATCH >= 2
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommShrink);
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister);
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister);
|
||||
#endif
|
||||
|
||||
#if RCCL_API_TRACE_VERSION_PATCH == 0
|
||||
static_assert(ROCPROFILER_RCCL_API_ID_LAST == 37);
|
||||
#elif RCCL_API_TRACE_VERSION_PATCH == 1
|
||||
static_assert(ROCPROFILER_RCCL_API_ID_LAST == 38);
|
||||
#elif RCCL_API_TRACE_VERSION_PATCH == 2
|
||||
static_assert(ROCPROFILER_RCCL_API_ID_LAST == 41);
|
||||
#else
|
||||
# if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) && \
|
||||
(defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0)
|
||||
|
||||
@@ -330,6 +330,30 @@ typedef union rocprofiler_rccl_api_args_t
|
||||
const void* acc;
|
||||
} ncclAllReduceWithBias;
|
||||
#endif
|
||||
#if RCCL_API_TRACE_VERSION_PATCH >= 2
|
||||
struct
|
||||
{
|
||||
ncclComm_t comm;
|
||||
int* excludeRanksList;
|
||||
int excludeRanksCount;
|
||||
ncclComm_t* newcomm;
|
||||
ncclConfig_t* config;
|
||||
int shrinkFlags;
|
||||
} ncclCommShrink;
|
||||
struct
|
||||
{
|
||||
ncclComm_t comm;
|
||||
void* buff;
|
||||
size_t size;
|
||||
ncclWindow_t* win;
|
||||
int winFlags;
|
||||
} ncclCommWindowRegister;
|
||||
struct
|
||||
{
|
||||
ncclComm_t comm;
|
||||
ncclWindow_t win;
|
||||
} ncclCommWindowDeregister;
|
||||
#endif
|
||||
} rocprofiler_rccl_api_args_t;
|
||||
|
||||
ROCPROFILER_EXTERN_C_FINI
|
||||
|
||||
@@ -70,6 +70,11 @@ typedef enum rocprofiler_rccl_api_id_t // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_RCCL_API_ID_ncclCommDeregister,
|
||||
#if RCCL_API_TRACE_VERSION_PATCH >= 1
|
||||
ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias,
|
||||
#endif
|
||||
#if RCCL_API_TRACE_VERSION_PATCH >= 2
|
||||
ROCPROFILER_RCCL_API_ID_ncclCommShrink,
|
||||
ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister,
|
||||
ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister,
|
||||
#endif
|
||||
ROCPROFILER_RCCL_API_ID_LAST,
|
||||
} rocprofiler_rccl_api_id_t;
|
||||
|
||||
+19
-1
@@ -47,7 +47,7 @@
|
||||
#define RCCL_API_TRACE_VERSION_MAJOR 0
|
||||
|
||||
// should be increased every time new members are added to existing dispatch tables
|
||||
#define RCCL_API_TRACE_VERSION_PATCH 1
|
||||
#define RCCL_API_TRACE_VERSION_PATCH 2
|
||||
|
||||
#if !defined(RCCL_EXTERN_C_INIT)
|
||||
# ifdef __cplusplus
|
||||
@@ -182,6 +182,13 @@ typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm,
|
||||
int* excludeRanksList,
|
||||
int excludeRanksCount,
|
||||
ncclComm_t* newcomm,
|
||||
ncclConfig_t* config,
|
||||
int shrinkFlags);
|
||||
|
||||
typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm,
|
||||
int color,
|
||||
int key,
|
||||
@@ -232,6 +239,14 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm,
|
||||
|
||||
typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
|
||||
|
||||
typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm,
|
||||
void* buff,
|
||||
size_t size,
|
||||
ncclWindow_t* win,
|
||||
int winFlags);
|
||||
|
||||
typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);
|
||||
|
||||
typedef struct rcclApiFuncTable
|
||||
{
|
||||
uint64_t size;
|
||||
@@ -273,6 +288,9 @@ typedef struct rcclApiFuncTable
|
||||
ncclCommRegister_fn_t ncclCommRegister_fn;
|
||||
ncclCommDeregister_fn_t ncclCommDeregister_fn;
|
||||
ncclAllReduceWithBias_fn_t ncclAllReduceWithBias_fn;
|
||||
ncclCommShrink_fn_t ncclCommShrink_fn;
|
||||
ncclCommWindowRegister_fn_t ncclCommWindowRegister_fn;
|
||||
ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
|
||||
} rcclApiFuncTable;
|
||||
|
||||
RCCL_EXTERN_C_FINI
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_H_
|
||||
#define NCCL_H_
|
||||
|
||||
@@ -13,18 +12,19 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#define NCCL_MAJOR 2
|
||||
#define NCCL_MINOR 20
|
||||
#define NCCL_PATCH 5
|
||||
#define NCCL_MINOR 27
|
||||
#define NCCL_PATCH 3
|
||||
#define NCCL_SUFFIX ""
|
||||
|
||||
#define NCCL_VERSION_CODE 22005
|
||||
#define NCCL_VERSION_CODE 22703
|
||||
#define NCCL_VERSION(X, Y, Z) \
|
||||
(((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z))
|
||||
|
||||
#define RCCL_BFLOAT16 1
|
||||
#define RCCL_FLOAT8 1
|
||||
#define RCCL_GATHER_SCATTER 1
|
||||
#define RCCL_ALLTOALLV 1
|
||||
#define RCCL_BFLOAT16 1
|
||||
#define RCCL_FLOAT8 1
|
||||
#define RCCL_GATHER_SCATTER 1
|
||||
#define RCCL_ALLTOALLV 1
|
||||
#define RCCL_ALLREDUCE_WITH_BIAS 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -35,7 +35,8 @@ extern "C" {
|
||||
/*! @brief Opaque handle to communicator
|
||||
@details A communicator contains information required to facilitate collective communications
|
||||
calls */
|
||||
typedef const struct ncclComm* ncclComm_t;
|
||||
typedef struct ncclComm* ncclComm_t;
|
||||
typedef struct ncclWindow* ncclWindow_t;
|
||||
#define NCCL_COMM_NULL NULL
|
||||
|
||||
#define NCCL_UNIQUE_ID_BYTES 128
|
||||
@@ -47,7 +48,7 @@ typedef struct
|
||||
} ncclUniqueId;
|
||||
|
||||
/*! @defgroup rccl_result_code Result Codes
|
||||
@brief The various result codes that RCCL API calls may return
|
||||
@details The various result codes that RCCL API calls may return
|
||||
@{ */
|
||||
|
||||
/*! @brief Result type
|
||||
@@ -69,17 +70,29 @@ typedef enum
|
||||
#define NCCL_CONFIG_UNDEF_INT INT_MIN
|
||||
#define NCCL_CONFIG_UNDEF_PTR NULL
|
||||
#define NCCL_SPLIT_NOCOLOR -1
|
||||
#define NCCL_UNDEF_FLOAT -1.0f
|
||||
|
||||
/* Window Registration flags */
|
||||
#define NCCL_WIN_DEFAULT 0x00
|
||||
#define NCCL_WIN_COLL_SYMMETRIC 0x01
|
||||
|
||||
/* NCCL performance policy */
|
||||
#define NCCL_CTA_POLICY_DEFAULT 0x00
|
||||
#define NCCL_CTA_POLICY_EFFICIENCY 0x01
|
||||
|
||||
/* ncclCommShrink flags*/
|
||||
#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
|
||||
#define NCCL_SHRINK_ABORT \
|
||||
0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */
|
||||
|
||||
/*! @defgroup rccl_config_type Communicator Configuration
|
||||
@brief Structure that allows for customizing Communicator behavior via
|
||||
@details Structure that allows for customizing Communicator behavior via
|
||||
ncclCommInitRankConfig
|
||||
@{ */
|
||||
|
||||
/**
|
||||
* @defgroup Communicator configuration
|
||||
* @brief Users can assign value to attributes to specify the behavior of a communicator.
|
||||
*/
|
||||
typedef struct ncclConfig_v21700
|
||||
/*! @brief Communicator configuration
|
||||
@details Users can assign value to attributes to specify the behavior of a communicator */
|
||||
typedef struct ncclConfig_v22700
|
||||
{
|
||||
/* attributes that users should never touch. */
|
||||
size_t size; /*!< Should not be touched */
|
||||
@@ -92,6 +105,12 @@ typedef struct ncclConfig_v21700
|
||||
int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */
|
||||
const char* netName; /*!< Force NCCL to use a specfic network */
|
||||
int splitShare; /*!< Allow communicators to share resources */
|
||||
int trafficClass; /*!< Traffic class*/
|
||||
const char* commName; /*!< Name of the communicator*/
|
||||
int collnetEnable; /*!< Check for collnet enablement*/
|
||||
int CTAPolicy; /*!< CTA Policy*/
|
||||
int shrinkShare; /*!< Shrink size*/
|
||||
int nvlsCTAs; /*!< Number of NVLS cooperative thread arrays (blocks)*/
|
||||
} ncclConfig_t;
|
||||
|
||||
/* Config initializer must be assigned to initialize config structure when it is created.
|
||||
@@ -106,10 +125,35 @@ typedef struct ncclConfig_v21700
|
||||
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
|
||||
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
|
||||
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
|
||||
NCCL_CONFIG_UNDEF_PTR, /* commName */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \
|
||||
}
|
||||
/*! @} */
|
||||
|
||||
/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */
|
||||
typedef struct ncclSimInfo_v22200
|
||||
{
|
||||
size_t size;
|
||||
unsigned int magic;
|
||||
unsigned int version;
|
||||
float estimatedTime;
|
||||
} ncclSimInfo_t;
|
||||
|
||||
/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created.
|
||||
* Not initialized simInfo will result in NCCL error. */
|
||||
#define NCCL_SIM_INFO_INITIALIZER \
|
||||
{ \
|
||||
sizeof(ncclSimInfo_t), /* size */ \
|
||||
0x74685283, /* magic */ \
|
||||
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
|
||||
NCCL_UNDEF_FLOAT /* estimated time */ \
|
||||
}
|
||||
|
||||
/* NCCL malloc and free function for all types of NCCL optimizations
|
||||
* (e.g. user buffer registration). The actual allocated size might
|
||||
* be larger than requested due to granularity requirement. */
|
||||
@@ -124,7 +168,7 @@ ncclResult_t
|
||||
pncclMemFree(void* ptr);
|
||||
|
||||
/*! @defgroup rccl_api_version Version Information
|
||||
@brief API call that returns RCCL version
|
||||
@details API call that returns RCCL version
|
||||
@{ */
|
||||
|
||||
/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
|
||||
@@ -142,7 +186,7 @@ pncclGetVersion(int* version);
|
||||
/*! @} */
|
||||
|
||||
/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
|
||||
@brief API calls that operate on communicators.
|
||||
@details API calls that operate on communicators.
|
||||
Communicators objects are used to launch collective communication
|
||||
operations. Unique ranks between 0 and N-1 must be assigned to
|
||||
each HIP device participating in the same Communicator.
|
||||
@@ -293,8 +337,69 @@ pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclCon
|
||||
/*! @endcond */
|
||||
/*! @} */
|
||||
|
||||
/*! @brief Shrink existing communicator.
|
||||
@details Ranks in excludeRanksList will be removed from the existing communicator.
|
||||
Within the new communicator, ranks will be re-ordered to fill the gap of removed
|
||||
ones. If config is NULL, the new communicator will inherit the original communicator's
|
||||
configuration. The flag enables NCCL to adapt to various states of the parent communicator, see
|
||||
NCCL_SHRINK flags.
|
||||
@return Result code. See @ref rccl_result_code for more details.
|
||||
|
||||
@param[in] comm Original communicator object for this rank
|
||||
@param[in] excludeRanksList List of ranks to be excluded
|
||||
@param[in] excludeRanksCount Number of ranks to be excluded
|
||||
@param[out] newcomm Pointer to new communicator
|
||||
@param[in] config Config file for new communicator. May be NULL to inherit from
|
||||
comm
|
||||
@param[in] shrinkFlags Flag to adapt to various states of the parent communicator
|
||||
(see NCCL_SHRINK flags)*/
|
||||
ncclResult_t
|
||||
ncclCommShrink(ncclComm_t comm,
|
||||
int* excludeRanksList,
|
||||
int excludeRanksCount,
|
||||
ncclComm_t* newcomm,
|
||||
ncclConfig_t* config,
|
||||
int shrinkFlags);
|
||||
ncclResult_t
|
||||
pncclCommShrink(ncclComm_t comm,
|
||||
int* excludeRanksList,
|
||||
int excludeRanksCount,
|
||||
ncclComm_t* newcomm,
|
||||
ncclConfig_t* config,
|
||||
int shrinkFlags);
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version), similar to
|
||||
ncclCommInitRankConfig.
|
||||
@details Allows to use more than one ncclUniqueId (up to one per rank),
|
||||
indicated by nId, to accelerate the init operation.
|
||||
The number of ncclUniqueIds and their order must be the same for every rank.
|
||||
@return Result code. See @ref rccl_result_code for more details.
|
||||
|
||||
@param[out] newcomm Pointer to new communicator
|
||||
@param[in] nranks Total number of ranks participating in this communicator
|
||||
@param[in] myrank Current rank
|
||||
@param[in] nId Number of unique IDs
|
||||
@param[in] commIds List of unique IDs
|
||||
@param[in] config Config file for new communicator. May be NULL to inherit from comm */
|
||||
ncclResult_t
|
||||
ncclCommInitRankScalable(ncclComm_t* newcomm,
|
||||
int nranks,
|
||||
int myrank,
|
||||
int nId,
|
||||
ncclUniqueId* commIds,
|
||||
ncclConfig_t* config);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t
|
||||
pncclCommInitRankScalable(ncclComm_t* newcomm,
|
||||
int nranks,
|
||||
int myrank,
|
||||
int nId,
|
||||
ncclUniqueId* commIds,
|
||||
ncclConfig_t* config);
|
||||
/*! @endcond */
|
||||
|
||||
/*! @defgroup rccl_api_errcheck Error Checking Calls
|
||||
@brief API calls that check for errors
|
||||
@details API calls that check for errors
|
||||
@{ */
|
||||
|
||||
/*! @brief Returns a string for each result code.
|
||||
@@ -302,6 +407,7 @@ pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclCon
|
||||
@return String containing description of result code.
|
||||
|
||||
@param[in] result Result code to get description for */
|
||||
/* Returns a string for each error code. */
|
||||
const char*
|
||||
ncclGetErrorString(ncclResult_t result);
|
||||
/*! @cond include_hidden */
|
||||
@@ -317,6 +423,14 @@ const char*
|
||||
pncclGetLastError(ncclComm_t comm);
|
||||
/*! @endcond */
|
||||
|
||||
/* Reload environment variables that determine logging. */
|
||||
void
|
||||
ncclResetDebugInit();
|
||||
/*! @cond include_hidden */
|
||||
void
|
||||
pncclResetDebugInit();
|
||||
/*! @endcond */
|
||||
|
||||
/*! @brief Checks whether the comm has encountered any asynchronous errors
|
||||
@details Query whether the provided communicator has encountered any asynchronous errors
|
||||
@return Result code. See @ref rccl_result_code for more details.
|
||||
@@ -332,7 +446,7 @@ pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
|
||||
/*! @} */
|
||||
|
||||
/*! @defgroup rccl_api_comminfo Communicator Information
|
||||
@brief API calls that query communicator information
|
||||
@details API calls that query communicator information
|
||||
@{ */
|
||||
|
||||
/*! @brief Gets the number of ranks in the communicator clique.
|
||||
@@ -392,8 +506,24 @@ ncclResult_t
|
||||
pncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
/*! @endcond */
|
||||
|
||||
/* Register memory window */
|
||||
ncclResult_t
|
||||
ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t
|
||||
pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
/*! @endcond */
|
||||
|
||||
/* Deregister symmetric memory */
|
||||
ncclResult_t
|
||||
ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t
|
||||
pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
|
||||
/*! @endcond */
|
||||
|
||||
/*! @defgroup rccl_api_enumerations API Enumerations
|
||||
@brief Enumerations used by collective communication calls
|
||||
@details Enumerations used by collective communication calls
|
||||
@{ */
|
||||
|
||||
/*! @brief Dummy reduction enumeration
|
||||
@@ -429,34 +559,29 @@ typedef enum
|
||||
@details Enumeration of the various supported datatype */
|
||||
typedef enum
|
||||
{
|
||||
ncclInt8 = 0,
|
||||
ncclChar = 0,
|
||||
ncclUint8 = 1,
|
||||
ncclInt32 = 2,
|
||||
ncclInt = 2,
|
||||
ncclUint32 = 3,
|
||||
ncclInt64 = 4,
|
||||
ncclUint64 = 5,
|
||||
ncclFloat16 = 6,
|
||||
ncclHalf = 6,
|
||||
ncclFloat32 = 7,
|
||||
ncclFloat = 7,
|
||||
ncclFloat64 = 8,
|
||||
ncclDouble = 8,
|
||||
ncclBfloat16 = 9,
|
||||
#if defined(RCCL_FLOAT8)
|
||||
ncclFp8E4M3 = 10,
|
||||
ncclFp8E5M2 = 11,
|
||||
ncclNumTypes = 12
|
||||
ncclInt8 = 0,
|
||||
ncclChar = 0,
|
||||
ncclUint8 = 1,
|
||||
ncclInt32 = 2,
|
||||
ncclInt = 2,
|
||||
ncclUint32 = 3,
|
||||
ncclInt64 = 4,
|
||||
ncclUint64 = 5,
|
||||
ncclFloat16 = 6,
|
||||
ncclHalf = 6,
|
||||
ncclFloat32 = 7,
|
||||
ncclFloat = 7,
|
||||
ncclFloat64 = 8,
|
||||
ncclDouble = 8,
|
||||
ncclBfloat16 = 9,
|
||||
ncclFloat8e4m3 = 10,
|
||||
ncclFloat8e5m2 = 11,
|
||||
ncclNumTypes = 12
|
||||
} ncclDataType_t;
|
||||
#else
|
||||
ncclNumTypes = 10
|
||||
} ncclDataType_t;
|
||||
#endif
|
||||
/*! @} */
|
||||
|
||||
/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
|
||||
@brief API calls relating to creation/destroying custom reduction operator
|
||||
@details API calls relating to creation/destroying custom reduction operator
|
||||
that pre-multiplies local source arrays prior to reduction
|
||||
@{ */
|
||||
|
||||
@@ -518,7 +643,7 @@ pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
/*! @} */
|
||||
|
||||
/*! @defgroup rccl_collective_api Collective Communication Operations
|
||||
@brief Collective communication operations must be called separately for each
|
||||
@details Collective communication operations must be called separately for each
|
||||
communicator in a communicator clique.
|
||||
|
||||
They return when operations have been enqueued on the HIP stream.
|
||||
@@ -662,6 +787,41 @@ pncclAllReduce(const void* sendbuff,
|
||||
hipStream_t stream);
|
||||
/*! @endcond */
|
||||
|
||||
/*! @brief All-Reduce-with-Bias
|
||||
@details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
|
||||
leaves identical copies of result on each *recvbuff*.
|
||||
In-place operation will happen if sendbuff == recvbuff.
|
||||
@return Result code. See @ref rccl_result_code for more details.
|
||||
|
||||
@param[in] sendbuff Input data array to reduce
|
||||
@param[out] recvbuff Data array to store reduced result array
|
||||
@param[in] count Number of elements in data buffer
|
||||
@param[in] datatype Data buffer element datatype
|
||||
@param[in] op Reduction operator
|
||||
@param[in] comm Communicator group object to execute on
|
||||
@param[in] stream HIP stream to execute collective on
|
||||
@param[in] acc Bias data array to reduce */
|
||||
ncclResult_t
|
||||
ncclAllReduceWithBias(const void* sendbuff,
|
||||
void* recvbuff,
|
||||
size_t count,
|
||||
ncclDataType_t datatype,
|
||||
ncclRedOp_t op,
|
||||
ncclComm_t comm,
|
||||
hipStream_t stream,
|
||||
const void* acc);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t
|
||||
pncclAllReduceWithBias(const void* sendbuff,
|
||||
void* recvbuff,
|
||||
size_t count,
|
||||
ncclDataType_t datatype,
|
||||
ncclRedOp_t op,
|
||||
ncclComm_t comm,
|
||||
hipStream_t stream,
|
||||
const void* acc);
|
||||
/*! @endcond */
|
||||
|
||||
/*! @brief Reduce-Scatter
|
||||
@details Reduces data in *sendbuff* using *op* operation and leaves reduced result
|
||||
scattered over the devices so that *recvbuff* on rank i will contain the i-th
|
||||
@@ -937,13 +1097,14 @@ pncclAllToAllv(const void* sendbuff,
|
||||
/*! @} */
|
||||
|
||||
/*! @defgroup msccl_api MSCCL Algorithm
|
||||
@brief API calls relating to the optional MSCCL algorithm datapath
|
||||
@details API calls relating to the optional MSCCL algorithm datapath
|
||||
@{ */
|
||||
|
||||
/*! @brief Opaque handle to MSCCL algorithm */
|
||||
typedef int mscclAlgoHandle_t;
|
||||
|
||||
/*! @brief MSCCL Load Algorithm
|
||||
@deprecated This function has been removed from the public API.
|
||||
@details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
|
||||
its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
|
||||
scheduler instead of end users.
|
||||
@@ -960,6 +1121,7 @@ pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle
|
||||
/*! @endcond */
|
||||
|
||||
/*! @brief MSCCL Run Algorithm
|
||||
@deprecated This function has been removed from the public API.
|
||||
@details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
|
||||
list merges all possible parameters required by different operations as this
|
||||
is a general-purposed API. This API is expected to be called by MSCCL
|
||||
@@ -1032,7 +1194,7 @@ pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
|
||||
/*! @} */
|
||||
|
||||
/*! @defgroup rccl_group_api Group semantics
|
||||
@brief When managing multiple GPUs from a single thread, and since RCCL collective
|
||||
@details When managing multiple GPUs from a single thread, and since RCCL collective
|
||||
calls may perform inter-CPU synchronization, we need to "group" calls for
|
||||
different ranks/devices into a single call.
|
||||
|
||||
@@ -1076,6 +1238,16 @@ pncclGroupEnd();
|
||||
/*! @endcond */
|
||||
/*! @} */
|
||||
|
||||
/*
|
||||
* Group Simulate End
|
||||
*
|
||||
* Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
|
||||
*/
|
||||
ncclResult_t
|
||||
ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
|
||||
ncclResult_t
|
||||
pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
#endif
|
||||
|
||||
@@ -73,12 +73,18 @@ ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommRegister_fn, 35)
|
||||
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommDeregister_fn, 36)
|
||||
// Per-member ABI checks for entries appended to rcclApiFuncTable by each
// RCCL_API_TRACE_VERSION_PATCH bump. The checks are cumulative, so every patch
// level gets its own independent #if block. Chaining them with #elif would skip
// the patch-2 checks whenever the ">= 1" branch is taken (which is always the
// case for PATCH >= 2), leaving the new members unverified.
#if RCCL_API_TRACE_VERSION_PATCH >= 1
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclAllReduceWithBias_fn, 37)
#endif
#if RCCL_API_TRACE_VERSION_PATCH >= 2
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommShrink_fn, 38)
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommWindowRegister_fn, 39)
ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommWindowDeregister_fn, 40)
#endif
|
||||
|
||||
#if RCCL_API_TRACE_VERSION_PATCH == 0
|
||||
ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 37)
|
||||
#elif RCCL_API_TRACE_VERSION_PATCH == 1
|
||||
ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 38)
|
||||
#elif RCCL_API_TRACE_VERSION_PATCH == 2
|
||||
ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 41)
|
||||
#else
|
||||
INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 0)
|
||||
#endif
|
||||
|
||||
@@ -105,6 +105,11 @@ RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_nc
|
||||
#if RCCL_API_TRACE_VERSION_PATCH >= 1
|
||||
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclAllReduceWithBias, ncclAllReduceWithBias, ncclAllReduceWithBias_fn, sendbuff, recvbuff, count, datatype, op, comm, stream, acc)
|
||||
#endif
|
||||
#if RCCL_API_TRACE_VERSION_PATCH >= 2
|
||||
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommShrink, ncclCommShrink, ncclCommShrink_fn, comm, excludeRanksList, excludeRanksCount, newcomm, config, shrinkFlags)
|
||||
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommWindowRegister, ncclCommWindowRegister, ncclCommWindowRegister_fn, comm, buff, size, win, winFlags)
|
||||
RCCL_API_INFO_DEFINITION_V(ROCPROFILER_RCCL_TABLE_ID, ROCPROFILER_RCCL_API_ID_ncclCommWindowDeregister, ncclCommWindowDeregister, ncclCommWindowDeregister_fn, comm, win)
|
||||
#endif
|
||||
|
||||
#else
|
||||
# error \
|
||||
|
||||
@@ -1160,21 +1160,72 @@ rocprofiler_set_api_table(const char* name,
|
||||
|
||||
auto* rccl_api = static_cast<rcclApiFuncTable*>(tables[0]);
|
||||
|
||||
// any internal modifications to the rcclApiFuncTable need to be done before we make the
|
||||
// copy or else those modifications will be lost when RCCL API tracing is enabled
|
||||
// because the RCCL API tracing invokes the function pointers from the copy below
|
||||
rocprofiler::rccl::copy_table(rccl_api, lib_instance);
|
||||
auto is_valid_rccl_dispatch_table = (rccl_api != nullptr);
|
||||
|
||||
// install rocprofiler API wrappers
|
||||
rocprofiler::rccl::update_table(rccl_api);
|
||||
// Runtime ABI validation for RCCL API dispatch table.
|
||||
//
|
||||
// NOTE: These checks are necessary because rocprofiler-sdk enforces ABI
|
||||
// compatibility at compile time. If RCCL is rebuilt afterwards with an
|
||||
// incorrect or mismatched dispatch table, compile-time checks are bypassed.
|
||||
#if ROCPROFILER_SDK_COMPUTE_VERSION(RCCL_API_TRACE_VERSION_MAJOR, \
|
||||
0, \
|
||||
RCCL_API_TRACE_VERSION_PATCH) >= 1
|
||||
// 1. For RCCL_API_TRACE_VERSION_PATCH = 1, ncclAllReduceWithBias_fn is expected
|
||||
// to be the last entry (38th function) in the dispatch table. Its offset is
|
||||
// therefore used as the canonical end of the table for patch 1.
|
||||
//
|
||||
// Problem: Some intermediate RCCL commits introduced new APIs *before*
|
||||
// ncclAllReduceWithBias_fn without bumping the ABI patch version. That
|
||||
// breaks the ABI contract with rocprofiler-sdk, because the table layout no
|
||||
// longer matches what the SDK was compiled against.
|
||||
//
|
||||
// 2. This check prevents such mismatches at runtime:
|
||||
// a. NCCL_VERSION_CODE < 22703 → indicates the first RCCL build was taken
|
||||
// before the broken commits.
|
||||
// b. rccl_api->size > offsetof(..., ncclAllReduceWithBias_fn) + sizeof(void*)
|
||||
// → indicates the current RCCL dispatch table is larger than expected,
|
||||
// meaning newer (broken) entries were inserted before the known last API.
|
||||
//
|
||||
// If both conditions are true, the dispatch table is invalid and tracing is
|
||||
// disabled to avoid corrupt output.
|
||||
if(is_valid_rccl_dispatch_table && NCCL_VERSION_CODE < 22703 &&
|
||||
rccl_api->size > offsetof(rcclApiFuncTable, ncclAllReduceWithBias_fn) + sizeof(void*))
|
||||
{
|
||||
is_valid_rccl_dispatch_table = false;
|
||||
|
||||
// Tracing notifications the runtime has initialized
|
||||
rocprofiler::runtime_init::initialize(
|
||||
ROCPROFILER_RUNTIME_INITIALIZATION_RCCL, lib_version, lib_instance);
|
||||
ROCP_CI_LOG(WARNING) << fmt::format(
|
||||
"Invalid RCCL dispatch table: layout does not match the expected "
|
||||
"rocprofiler-SDK ABI (RCCL API Trace v{}.{}.{}). "
|
||||
"Tracing is disabled to prevent corrupted data. "
|
||||
"Use a compatible RCCL version.",
|
||||
RCCL_API_TRACE_VERSION_MAJOR,
|
||||
0,
|
||||
RCCL_API_TRACE_VERSION_PATCH);
|
||||
}
|
||||
#endif
|
||||
if(is_valid_rccl_dispatch_table)
|
||||
{
|
||||
// any internal modifications to the rcclApiFuncTable need to be done before we make
|
||||
// the copy or else those modifications will be lost when RCCL API tracing is
|
||||
// enabled because the RCCL API tracing invokes the function pointers from the copy
|
||||
// below
|
||||
rocprofiler::rccl::copy_table(rccl_api, lib_instance);
|
||||
|
||||
// allow tools to install API wrappers
|
||||
rocprofiler::intercept_table::notify_intercept_table_registration(
|
||||
ROCPROFILER_RCCL_TABLE, lib_version, lib_instance, std::make_tuple(rccl_api));
|
||||
// install rocprofiler API wrappers
|
||||
rocprofiler::rccl::update_table(rccl_api);
|
||||
|
||||
// Tracing notifications the runtime has initialized
|
||||
rocprofiler::runtime_init::initialize(
|
||||
ROCPROFILER_RUNTIME_INITIALIZATION_RCCL, lib_version, lib_instance);
|
||||
|
||||
// allow tools to install API wrappers
|
||||
rocprofiler::intercept_table::notify_intercept_table_registration(
|
||||
ROCPROFILER_RCCL_TABLE, lib_version, lib_instance, std::make_tuple(rccl_api));
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCP_CI_LOG(WARNING) << "RCCL API tracing is disabled: dispatch table is invalid.";
|
||||
}
|
||||
}
|
||||
else if(std::string_view{name} == "rocdecode")
|
||||
{
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren