Files
rocm-systems/projects/rccl/src/nccl.h.in
T

1023 baris
54 KiB
C
Mentah Pandangan Normal Riwayat

2018-09-24 16:06:59 -07:00
/*************************************************************************
2021-04-12 16:00:11 -07:00
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
2022-12-13 07:51:04 +08:00
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
2018-09-24 16:06:59 -07:00
*
* See LICENSE.txt for license information
************************************************************************/
2025-11-20 09:35:15 -05:00
2018-09-24 16:06:59 -07:00
#ifndef NCCL_H_
#define NCCL_H_
#include <hip/hip_runtime.h>
2019-07-05 15:43:00 -07:00
#include <hip/hip_fp16.h>
2018-09-24 16:06:59 -07:00
2019-07-05 15:43:00 -07:00
#define NCCL_MAJOR ${NCCL_MAJOR}
#define NCCL_MINOR ${NCCL_MINOR}
#define NCCL_PATCH ${NCCL_PATCH}
#define NCCL_SUFFIX "${NCCL_SUFFIX}"
2018-09-24 16:06:59 -07:00
2019-07-05 15:43:00 -07:00
#define NCCL_VERSION_CODE ${NCCL_VERSION}
2021-09-08 13:56:25 -07:00
/* Encode a (major, minor, patch) triple as a single integer version code.
 * When X <= 2 and Y <= 8 the result is X*1000 + Y*100 + Z; otherwise it is
 * X*10000 + Y*100 + Z. Each argument is expanded more than once, so do not
 * pass expressions with side effects. */
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
2018-09-24 16:06:59 -07:00
2019-11-15 10:39:48 -08:00
#define RCCL_BFLOAT16 1
2024-03-09 07:17:53 +08:00
#define RCCL_FLOAT8 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1
#define RCCL_ALLREDUCE_WITH_BIAS 1
2019-11-15 10:39:48 -08:00
2018-09-24 16:06:59 -07:00
#ifdef __cplusplus
extern "C" {
#endif
2023-02-27 02:48:21 -08:00
#include <limits.h>
2023-07-28 16:09:06 -06:00
/*! @brief Opaque handle to communicator
@details A communicator contains information required to facilitate collective communications calls */
2018-09-24 16:06:59 -07:00
typedef struct ncclComm* ncclComm_t;
2025-09-02 13:21:14 -07:00
typedef struct ncclWindow_vidmem* ncclWindow_t;
2023-02-27 02:48:21 -08:00
#define NCCL_COMM_NULL NULL
2018-09-24 16:06:59 -07:00
#define NCCL_UNIQUE_ID_BYTES 128
2023-07-28 16:09:06 -06:00
/*! @brief Opaque unique id used to initialize communicators
@details The ncclUniqueId must be passed to all participating ranks */
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array */} ncclUniqueId;
/*! @defgroup rccl_result_code Result Codes
@details The various result codes that RCCL API calls may return
@{ */
/*! @brief Result type
@details Return codes aside from ncclSuccess indicate that a call has failed */
typedef enum {
  ncclSuccess            = 0, /*!< No error */
  ncclUnhandledCudaError = 1, /*!< A HIP error went unhandled */
  ncclSystemError        = 2, /*!< A system error went unhandled */
  ncclInternalError      = 3, /*!< Internal error - please report to RCCL developers */
  ncclInvalidArgument    = 4, /*!< An argument was invalid */
  ncclInvalidUsage       = 5, /*!< The API was used incorrectly */
  ncclRemoteError        = 6, /*!< A remote process exited or a network error occurred */
  ncclInProgress         = 7, /*!< An RCCL operation is still in progress */
  ncclNumResults         = 8  /*!< Number of result types */
} ncclResult_t;
/*! @} */
2022-08-18 02:53:17 -07:00
2023-02-27 02:48:21 -08:00
#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL
2023-04-03 05:32:07 -07:00
/* Color value for ncclCommSplit meaning "this rank joins no new communicator".
 * Parenthesized so the negative literal stays a single operand wherever the
 * macro is expanded. */
#define NCCL_SPLIT_NOCOLOR (-1)
2024-06-11 01:28:01 -07:00
/* Sentinel for an unset float field (used by NCCL_SIM_INFO_INITIALIZER).
 * Parenthesized so the negative literal stays a single operand wherever the
 * macro is expanded. */
#define NCCL_UNDEF_FLOAT (-1.0f)
2023-02-27 02:48:21 -08:00
2025-05-29 20:56:40 -07:00
/* Window Registration flags */
#define NCCL_WIN_DEFAULT 0x00
#define NCCL_WIN_COLL_SYMMETRIC 0x01
2025-09-02 13:21:14 -07:00
#define NCCL_WIN_REQUIRED_ALIGNMENT 4096
2025-05-29 20:56:40 -07:00
/* NCCL performance policy */
#define NCCL_CTA_POLICY_DEFAULT 0x00
#define NCCL_CTA_POLICY_EFFICIENCY 0x01
2025-09-02 13:21:14 -07:00
#define NCCL_CTA_POLICY_ZERO 0x02
2025-05-29 20:56:40 -07:00
/* ncclCommShrink flags*/
#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
#define NCCL_SHRINK_ABORT 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */
2023-07-28 16:09:06 -06:00
/*! @defgroup rccl_config_type Communicator Configuration
@details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig
@{ */
/*! @brief Communicator configuration
@details Users can assign value to attributes to specify the behavior of a communicator */
2025-09-02 13:21:14 -07:00
typedef struct ncclConfig_v22800 {
2022-08-18 02:53:17 -07:00
/* Attributes that users should never touch; NCCL_CONFIG_INITIALIZER fills
 * them in with sizeof(ncclConfig_t), a magic constant and the version code. */
2023-07-28 16:09:06 -06:00
size_t size; /*!< Should not be touched */
unsigned int magic; /*!< Should not be touched */
unsigned int version; /*!< Should not be touched */
2022-08-18 02:53:17 -07:00
/* Attributes that users are able to customize; NCCL_CONFIG_INITIALIZER sets
 * each one to NCCL_CONFIG_UNDEF_INT or NCCL_CONFIG_UNDEF_PTR. */
2023-07-28 16:09:06 -06:00
int blocking; /*!< Whether or not calls should block */
int cgaClusterSize; /*!< Cooperative group array cluster size */
int minCTAs; /*!< Minimum number of cooperative thread arrays (blocks) */
int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */
const char *netName; /*!< Force NCCL to use a specific network */
int splitShare; /*!< Allow communicators to share resources */
int trafficClass; /*!< Traffic class */
const char *commName; /*!< Name of the communicator */
int collnetEnable; /*!< Check for collnet enablement */
int CTAPolicy; /*!< CTA Policy (see the NCCL_CTA_POLICY_* values) */
int shrinkShare; /*!< Shrink size */
int nvlsCTAs; /*!< Number of NVLS cooperative thread arrays (blocks) */
int nChannelsPerNetPeer; /*!< Number of channels per NET peer */
int nvlinkCentricSched; /*!< nvlinkCentricSched */
2022-08-18 02:53:17 -07:00
} ncclConfig_t;
/* Config initializer must be assigned to initialize config structure when it is created.
2023-07-28 16:09:06 -06:00
* Not initialized config will result in an error. */
#define NCCL_CONFIG_INITIALIZER { \
sizeof(ncclConfig_t), /* size */ \
0xcafebeef, /* magic */ \
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
NCCL_CONFIG_UNDEF_INT, /* blocking */ \
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
NCCL_CONFIG_UNDEF_PTR, /* commName */ \
NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \
NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \
NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \
NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \
NCCL_CONFIG_UNDEF_INT, /* nChannelsPerNetPeer */ \
NCCL_CONFIG_UNDEF_INT, /* nvlinkCentricSched */ \
2022-08-18 02:53:17 -07:00
}
2023-07-28 16:09:06 -06:00
/*! @} */
2018-09-24 16:06:59 -07:00
2024-06-11 01:28:01 -07:00
/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */
typedef struct ncclSimInfo_v22200 {
  size_t       size;          /* NCCL_SIM_INFO_INITIALIZER stores sizeof(ncclSimInfo_t) here */
  unsigned int magic;         /* NCCL_SIM_INFO_INITIALIZER stores 0x74685283 here */
  unsigned int version;       /* NCCL_SIM_INFO_INITIALIZER stores the NCCL_VERSION(...) code here */
  float        estimatedTime; /* starts as NCCL_UNDEF_FLOAT; units are not stated in this header */
} ncclSimInfo_t;
/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created.
* Not initialized simInfo will result in NCCL error. */
#define NCCL_SIM_INFO_INITIALIZER {                                      \
  sizeof(ncclSimInfo_t),                             /* size */           \
  0x74685283,                                        /* magic */          \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH),  /* version */        \
  NCCL_UNDEF_FLOAT                                   /* estimated time */ \
}
2024-02-11 22:32:55 -08:00
/* NCCL malloc and free function for all types of NCCL optimizations
* (e.g. user buffer registration). The actual allocated size might
* be larger than requested due to granularity requirement. */
/* Allocation entry point described by the comment above.
 * NOTE(review): presumably stores the new buffer's address in *ptr - confirm
 * against the implementation. */
ncclResult_t ncclMemAlloc(void** ptr, size_t size);
/*! @cond include_hidden */
ncclResult_t pncclMemAlloc(void** ptr, size_t size);
/*! @endcond */
/* Matching release entry point.
 * NOTE(review): assumes ptr came from ncclMemAlloc - TODO confirm. */
ncclResult_t ncclMemFree(void *ptr);
/*! @cond include_hidden */
ncclResult_t pncclMemFree(void *ptr);
/*! @endcond */
2023-07-28 16:09:06 -06:00
/*! @defgroup rccl_api_version Version Information
@details API call that returns RCCL version
@{ */
2023-07-28 16:09:06 -06:00
/*! @brief Return the NCCL_VERSION_CODE of RCCL in the supplied integer.
@details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL.
@return Result code. See @ref rccl_result_code for more details.
2023-07-28 16:09:06 -06:00
@param[out] version Pointer to where version will be stored */
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
ncclResult_t ncclGetVersion(int *version);
/*! @cond include_hidden */
ncclResult_t pncclGetVersion(int *version);
/*! @endcond */
/*! @} */
/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction
@details API calls that operate on communicators.
Communicators objects are used to launch collective communication
operations. Unique ranks between 0 and N-1 must be assigned to
each HIP device participating in the same Communicator.
Using the same HIP device for multiple ranks of the same Communicator
is not supported at this time.
@{ */
/*! @brief Generates an ID for ncclCommInitRank.
@details Generates an ID to be used in ncclCommInitRank.
ncclGetUniqueId should be called once by a single rank and the
ID should be distributed to all ranks in the communicator before
using it as a parameter for ncclCommInitRank.
@return Result code. See @ref rccl_result_code for more details.
@param[out] uniqueId Pointer to where uniqueId will be stored */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Create a new communicator with config.
@details Create a new communicator (multi thread/process version) with a configuration
set by users. See @ref rccl_config_type for more details.
Each rank is associated to a HIP device, which has to be set before calling
ncclCommInitRank.
@return Result code. See @ref rccl_result_code for more details.
@param[out] comm Pointer to created communicator
@param[in] nranks Total number of ranks participating in this communicator
@param[in] commId UniqueId required for initialization
@param[in] rank Current rank to create communicator for. [0 to nranks-1]
@param[in] config Pointer to communicator configuration */
2022-08-18 02:53:17 -07:00
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2022-08-18 02:53:17 -07:00
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Creates a new communicator (multi thread/process version).
@details Rank must be between 0 and nranks-1 and unique within a communicator clique.
Each rank is associated to a HIP device, which has to be set before calling
ncclCommInitRank. ncclCommInitRank implicitly synchronizes with other ranks,
so it must be called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
@return Result code. See @ref rccl_result_code for more details.
@param[out] comm Pointer to created communicator
@param[in] nranks Total number of ranks participating in this communicator
@param[in] commId UniqueId required for initialization
@param[in] rank Current rank to create communicator for */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Creates a clique of communicators (single process version).
@details This is a convenience function to create a single-process communicator clique.
Returns an array of ndev newly initialized communicators in comm.
comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
If devlist is NULL, the first ndev HIP devices are used.
Order of devlist defines user-order of processors within the communicator.
@return Result code. See @ref rccl_result_code for more details.
@param[out] comm Pointer to array of created communicators
@param[in] ndev Total number of ranks participating in this communicator
@param[in] devlist Array of GPU device indices to create for */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Finalize a communicator.
@details ncclCommFinalize flushes all issued communications
and marks communicator state as ncclInProgress. The state will change to ncclSuccess
when the communicator is globally quiescent and related resources are freed; then,
calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
itself) without blocking.
@return Result code. See @ref rccl_result_code for more details.
@param[in] comm Communicator to finalize */
2022-08-18 02:53:17 -07:00
ncclResult_t ncclCommFinalize(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2022-08-18 02:53:17 -07:00
ncclResult_t pncclCommFinalize(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @endcond */
2023-07-28 16:09:06 -06:00
/*! @brief Frees local resources associated with communicator object.
@details Destroy all local resources associated with the passed in communicator object
@return Result code. See @ref rccl_result_code for more details.
2022-08-18 02:53:17 -07:00
2023-07-28 16:09:06 -06:00
@param[in] comm Communicator to destroy */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclCommDestroy(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclCommDestroy(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @endcond */
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
/*! @brief Abort any in-progress calls and destroy the communicator object.
@details Frees resources associated with communicator object and aborts any operations
that might still be running on the device.
@return Result code. See @ref rccl_result_code for more details.
@param[in] comm Communicator to abort and destroy */
2018-12-13 15:56:12 -08:00
ncclResult_t ncclCommAbort(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-12-13 15:56:12 -08:00
ncclResult_t pncclCommAbort(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Create one or more communicators from an existing one.
@details Creates one or more communicators from an existing one.
Ranks with the same color will end up in the same communicator.
Within the new communicator, key will be used to order ranks.
NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
and will therefore return a NULL communicator.
If config is NULL, the new communicator will inherit the original communicator's configuration
@return Result code. See @ref rccl_result_code for more details.
@param[in] comm Original communicator object for this rank
@param[in] color Color to assign this rank
@param[in] key Key used to order ranks within the same new communicator
@param[out] newcomm Pointer to new communicator
@param[in] config Config file for new communicator. May be NULL to inherit from comm */
2023-04-03 05:32:07 -07:00
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2023-04-03 05:32:07 -07:00
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @} */
/*! @brief Shrink existing communicator.
@details Ranks in excludeRanksList will be removed from the existing communicator.
Within the new communicator, ranks will be re-ordered to fill the gap of removed ones.
If config is NULL, the new communicator will inherit the original communicator's configuration.
The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.
@return Result code. See @ref rccl_result_code for more details.
@param[in] comm Original communicator object for this rank
@param[in] excludeRanksList List of ranks to be excluded
@param[in] excludeRanksCount Number of ranks to be excluded
@param[out] newcomm Pointer to new communicator
@param[in] config Config file for new communicator. May be NULL to inherit from comm
@param[in] shrinkFlags Flag to adapt to various states of the parent communicator (see NCCL_SHRINK flags)*/
2025-05-29 20:56:40 -07:00
ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
/*! @cond include_hidden */
ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
/*! @endcond */
/*! @brief Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
@details Allows to use more than one ncclUniqueId (up to one per rank),
indicated by nId, to accelerate the init operation.
The number of ncclUniqueIds and their order must be the same for every rank.
@return Result code. See @ref rccl_result_code for more details.
@param[out] newcomm Pointer to new communicator
@param[in] nranks Total number of ranks participating in this communicator
@param[in] myrank Current rank
@param[in] nId Number of unique IDs
@param[in] commIds List of unique IDs
@param[in] config Config file for new communicator. May be NULL to inherit from comm */
2024-09-10 05:57:10 -07:00
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
/*! @cond include_hidden */
2024-09-10 05:57:10 -07:00
ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
/*! @endcond */
2023-07-28 16:09:06 -06:00
/*! @defgroup rccl_api_errcheck Error Checking Calls
@details API calls that check for errors
@{ */
/*! @brief Returns a string for each result code.
@details Returns a human-readable string describing the given result code.
@return String containing description of result code.
2023-04-03 05:32:07 -07:00
2023-07-28 16:09:06 -06:00
@param[in] result Result code to get description for */
2022-05-24 02:02:31 -07:00
/* Returns a string for each error code. */
2018-09-24 16:06:59 -07:00
const char* ncclGetErrorString(ncclResult_t result);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
const char* pncclGetErrorString(ncclResult_t result);
2023-07-28 16:09:06 -06:00
/*! @endcond */
2018-09-24 16:06:59 -07:00
2024-02-05 05:06:02 -08:00
/* Returns a human-readable message of the last error that occurred. */
2022-05-24 02:02:31 -07:00
const char* ncclGetLastError(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2022-11-29 04:27:46 -08:00
const char* pncclGetLastError(ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @endcond */
2024-12-18 08:26:06 -08:00
/* Reload environment variables that determine logging. */
2025-09-02 13:21:14 -07:00
__attribute__ ((deprecated("ncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future")))
2024-12-18 08:26:06 -08:00
/* Reloads the environment variables that determine logging (deprecated).
 * The explicit (void) gives C callers a real prototype; before C23 an empty
 * () leaves the parameter list unspecified. */
void ncclResetDebugInit(void);
/*! @cond include_hidden */
2024-12-18 08:26:06 -08:00
/* Hidden p-prefixed counterpart of ncclResetDebugInit. The explicit (void)
 * gives C callers a real prototype; before C23 an empty () leaves the
 * parameter list unspecified. */
void pncclResetDebugInit(void);
/*! @endcond */
2023-07-28 16:09:06 -06:00
/*! @brief Checks whether the comm has encountered any asynchronous errors
@details Query whether the provided communicator has encountered any asynchronous errors
@return Result code. See @ref rccl_result_code for more details.
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
@param[in] comm Communicator to query
@param[out] asyncError Pointer to where result code will be stored */
2018-12-13 15:56:12 -08:00
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-12-13 15:56:12 -08:00
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @} */
2018-12-13 15:56:12 -08:00
2023-07-28 16:09:06 -06:00
/*! @defgroup rccl_api_comminfo Communicator Information
@details API calls that query communicator information
@{ */
/*! @brief Gets the number of ranks in the communicator clique.
@details Returns the number of ranks in the communicator clique (as set during initialization)
@return Result code. See @ref rccl_result_code for more details.
@param[in] comm Communicator to query
@param[out] count Pointer to where number of ranks will be stored */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Get the ROCm device index associated with a communicator
@details Returns the ROCm device number associated with the provided communicator.
@return Result code. See @ref rccl_result_code for more details.
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
@param[in] comm Communicator to query
@param[out] device Pointer to where the associated ROCm device index will be stored */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
2023-07-28 16:09:06 -06:00
/*! @endcond */
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
/*! @brief Get the rank associated with a communicator
@details Returns the user-ordered "rank" associated with the provided communicator.
@return Result code. See @ref rccl_result_code for more details.
@param[in] comm Communicator to query
@param[out] rank Pointer to where the associated rank will be stored */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @} */
2024-02-05 05:06:02 -08:00
/* Register CUDA buffer for zero-copy operation */
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
/*! @cond include_hidden */
2024-02-05 05:06:02 -08:00
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
/*! @endcond */
2024-02-05 05:06:02 -08:00
/* Deregister CUDA buffer */
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
/*! @cond include_hidden */
2024-02-05 05:06:02 -08:00
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
/*! @endcond */
2024-02-05 05:06:02 -08:00
2025-05-29 20:56:40 -07:00
/* Register memory window */
ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
/*! @cond include_hidden */
2025-05-29 20:56:40 -07:00
ncclResult_t pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
/*! @endcond */
2025-05-29 20:56:40 -07:00
/* Deregister symmetric memory */
ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
/*! @cond include_hidden */
2025-05-29 20:56:40 -07:00
ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
/*! @endcond */
2023-07-28 16:09:06 -06:00
/*! @defgroup rccl_api_enumerations API Enumerations
@details Enumerations used by collective communication calls
@{ */
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
/*! @brief Dummy reduction enumeration
@details Dummy reduction enumeration used to determine value for ncclMaxRedOp */
2021-09-08 13:56:25 -07:00
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
2023-07-28 16:09:06 -06:00
/*! @brief Reduction operation selector
@details Enumeration used to specify the various reduction operations
ncclNumOps is the number of built-in ncclRedOp_t values and serves as
the least possible value for dynamic ncclRedOp_t values constructed by
ncclRedOpCreate functions.
ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined
to be the largest signed value (since compilers are permitted to use
signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous
RCCL versions to maintain ABI compatibility. */
typedef enum { ncclSum = 0, /*!< Sum */
ncclProd = 1, /*!< Product */
ncclMax = 2, /*!< Max */
ncclMin = 3, /*!< Min */
ncclAvg = 4, /*!< Average */
ncclNumOps = 5, /*!< Number of built-in reduction ops */
ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */
2021-09-08 13:56:25 -07:00
} ncclRedOp_t;
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
/*! @brief Data types
@details Enumeration of the various supported datatype */
2018-09-24 16:06:59 -07:00
typedef enum { ncclInt8 = 0, ncclChar = 0, /* ncclChar is an alias of ncclInt8 */
ncclUint8 = 1,
ncclInt32 = 2, ncclInt = 2, /* ncclInt is an alias of ncclInt32 */
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6, ncclHalf = 6, /* ncclHalf is an alias of ncclFloat16 */
ncclFloat32 = 7, ncclFloat = 7, /* ncclFloat is an alias of ncclFloat32 */
ncclFloat64 = 8, ncclDouble = 8, /* ncclDouble is an alias of ncclFloat64 */
2019-11-15 10:39:48 -08:00
ncclBfloat16 = 9,
2024-12-18 08:26:06 -08:00
ncclFloat8e4m3 = 10,
ncclFloat8e5m2 = 11,
ncclNumTypes = 12 /* one past the largest datatype value listed above */
2021-07-08 14:12:04 -07:00
} ncclDataType_t;
2023-07-28 16:09:06 -06:00
/*! @} */
/*! @defgroup rccl_api_custom_redop Custom Reduction Operator
@details API calls relating to creation/destroying custom reduction operator
that pre-multiplies local source arrays prior to reduction
@{ */
/*! @brief Location and dereferencing logic for scalar arguments.
@details Enumeration specifying memory location of the scalar argument.
Based on where the value is stored, the argument will be dereferenced either
while the collective is running (if in device memory), or before the ncclRedOpCreate()
function returns (if in host memory). */
2021-09-08 13:56:25 -07:00
typedef enum {
2023-07-28 16:09:06 -06:00
ncclScalarDevice = 0, /*!< Scalar is in device-visible memory */
ncclScalarHostImmediate = 1 /*!< Scalar is in host-visible memory */
2021-09-08 13:56:25 -07:00
} ncclScalarResidence_t;
2023-07-28 16:09:06 -06:00
/*! @brief Create a custom pre-multiplier reduction operator
@details Creates a new reduction operator which pre-multiplies input values by a given
scalar locally before reducing them with peer values via summation. For use
only with collectives launched against *comm* and *datatype*. The
*residence* argument indicates how/when the memory pointed to by *scalar*
will be dereferenced. Upon return, the newly created operator's handle
is stored in *op*.
@return Result code. See @ref rccl_result_code for more details.
@param[out] op Pointer to where newly created custom reduction operator is to be stored
@param[in] scalar Pointer to scalar value.
@param[in] datatype Scalar value datatype
@param[in] residence Memory type of the scalar value
@param[in] comm Communicator to associate with this custom reduction operator */
2021-09-08 13:56:25 -07:00
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2021-09-08 13:56:25 -07:00
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Destroy custom reduction operator
@details Destroys the reduction operator *op*. The operator must have been created by
ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
destroyed as soon as the last RCCL function which is given that operator returns.
@return Result code. See @ref rccl_result_code for more details.
2021-09-08 13:56:25 -07:00
2023-07-28 16:09:06 -06:00
@param[in] op Custom reduction operator is to be destroyed
@param[in] comm Communicator associated with this reduction operator */
2021-09-08 13:56:25 -07:00
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2021-09-08 13:56:25 -07:00
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @} */
/*! @defgroup rccl_collective_api Collective Communication Operations
@details Collective communication operations must be called separately for each
communicator in a communicator clique.
They return when operations have been enqueued on the HIP stream.
Since they may perform inter-CPU synchronization, each call has to be done
from a different thread or process, or need to use Group Semantics (see
below).
@{ */
/*! @brief Reduce
@details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op*
operation.
*recvbuff* may be NULL on all calls except for root device.
*root* is the rank (not the HIP device) where data will reside after the
operation is complete.
In-place operation will happen if sendbuff == recvbuff.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Local device data buffer to be reduced
@param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be null for other ranks.
@param[in] count Number of elements in every send buffer
@param[in] datatype Data buffer element datatype
@param[in] op Reduction operator type
@param[in] root Rank where result data array will be stored
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
2019-07-05 15:43:00 -07:00
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
2019-07-05 15:43:00 -07:00
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief (Deprecated) Broadcast (in-place)
@details Copies *count* values from *root* to all other devices.
root is the rank (not the HIP device) where data resides before the
operation is started.
This operation is implicitly in-place.
@return Result code. See @ref rccl_result_code for more details.
@param[in,out] buff Input array on *root* to be copied to other ranks. Output array for all ranks.
@param[in] count Number of elements in data buffer
@param[in] datatype Data buffer element datatype
@param[in] root Rank owning buffer to be copied to others
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
2019-07-05 15:43:00 -07:00
ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
2019-07-05 15:43:00 -07:00
ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Broadcast
@details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices.
*root* is the rank (not the HIP device) where data resides before the operation is started.
*sendbuff* may be NULL on ranks other than *root*.
In-place operation will happen if *sendbuff* == *recvbuff*.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Data array to copy (if *root*). May be NULL for other ranks
@param[out] recvbuff Data array to store received array
@param[in] count Number of elements in data buffer
@param[in] datatype Data buffer element datatype
@param[in] root Rank of broadcast root
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
2019-07-05 15:43:00 -07:00
ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
2019-07-05 15:43:00 -07:00
ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief All-Reduce
@details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
leaves identical copies of result on each *recvbuff*.
In-place operation will happen if sendbuff == recvbuff.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Input data array to reduce
@param[out] recvbuff Data array to store reduced result array
@param[in] count Number of elements in data buffer
@param[in] datatype Data buffer element datatype
@param[in] op Reduction operator
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
2019-07-05 15:43:00 -07:00
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
2019-07-05 15:43:00 -07:00
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief      All-Reduce-with-Bias
    @details    Reduces data arrays of length *count* in *sendbuff* using *op* operation, and
                leaves identical copies of result on each *recvbuff*.
                In-place operation will happen if *sendbuff* == *recvbuff*.
                NOTE(review): this text does not say how *acc* is combined with the
                reduction result or what length it must have -- confirm against the
                implementation before relying on it.
    @return     Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Input data array to reduce
    @param[out] recvbuff  Data array to store reduced result array
    @param[in]  count     Number of elements in data buffer
    @param[in]  datatype  Data buffer element datatype
    @param[in]  op        Reduction operator
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on
    @param[in]  acc       Bias data array to reduce */
ncclResult_t ncclAllReduceWithBias(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc);
/*! @cond include_hidden */
ncclResult_t pncclAllReduceWithBias(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc);
/*! @endcond */
2023-07-28 16:09:06 -06:00
/*! @brief Reduce-Scatter
@details Reduces data in *sendbuff* using *op* operation and leaves reduced result
scattered over the devices so that *recvbuff* on rank i will contain the i-th
block of the result.
Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff*
should have a size of at least nranks*recvcount elements.
In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Input data array to reduce
@param[out] recvbuff Data array to store reduced result subarray
@param[in] recvcount Number of elements each rank receives
@param[in] datatype Data buffer element datatype
@param[in] op Reduction operator
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
2019-07-05 15:43:00 -07:00
hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
2019-07-05 15:43:00 -07:00
hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief All-Gather
@details Each device gathers *sendcount* values from other GPUs into *recvbuff*,
receiving data from rank i at offset i*sendcount.
Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
should have a size of at least nranks*sendcount elements.
In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Input data array to send
@param[out] recvbuff Data array to store the gathered result
@param[in] sendcount Number of elements each rank sends
@param[in] datatype Data buffer element datatype
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2018-09-24 16:06:59 -07:00
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
2019-07-05 15:43:00 -07:00
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2018-09-24 16:06:59 -07:00
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
2019-07-05 15:43:00 -07:00
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief All-to-All
@details Each device sends count values to all other devices and receives count values
from all other devices. Data to send to destination rank j is taken from
sendbuff+j*count and data received from source rank i is placed at
recvbuff+i*count.
2023-07-28 16:09:06 -06:00
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Data array to send (contains blocks for each other rank)
@param[out] recvbuff Data array to receive (contains blocks from each other rank)
@param[in] count Number of elements to send between each pair of ranks
2023-07-28 16:09:06 -06:00
@param[in] datatype Data buffer element datatype
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2025-09-02 13:21:14 -07:00
ncclResult_t ncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2025-09-02 13:21:14 -07:00
ncclResult_t pncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief      All-To-Allv
    @details    Device (i) sends sendcounts[j] of data from offset sdispls[j]
                to device (j). At the same time, device (i) receives recvcounts[j] of data
                from device (j) to be placed at rdispls[j].
                sendcounts, sdispls, recvcounts and rdispls are all measured in the units
                of datatype, not bytes.
                In-place operation will happen if sendbuff == recvbuff.
    @return     Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff    Data array to send (contains blocks for each other rank)
    @param[in]  sendcounts  Array containing number of elements to send to each participating rank
    @param[in]  sdispls     Array of offsets into *sendbuff* for each participating rank
    @param[out] recvbuff    Data array to receive (contains blocks from each other rank)
    @param[in]  recvcounts  Array containing number of elements to receive from each participating rank
    @param[in]  rdispls     Array of offsets into *recvbuff* for each participating rank
    @param[in]  datatype    Data buffer element datatype
    @param[in]  comm        Communicator group object to execute on
    @param[in]  stream      HIP stream to execute collective on */
ncclResult_t ncclAlltoAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclAlltoAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
/*! @brief Gather
@details Each rank sends count elements from sendbuff to the root rank.
On the root rank, data from rank i is placed at recvbuff + i*count.
On non-root ranks, recvbuff is not used.
root is the rank where data will be gathered.
In-place operations will happen if sendbuff == recvbuff + root * count.
2023-07-28 16:09:06 -06:00
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Data array to send
@param[in] recvbuff Data array to recv
@param[in] count Number of elements
2023-07-28 16:09:06 -06:00
@param[in] datatype Data buffer element datatype
@param[in] root Rank of gather root
2023-07-28 16:09:06 -06:00
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2025-09-02 13:21:14 -07:00
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2025-09-02 13:21:14 -07:00
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief Scatter
@details On the root rank, count elements from sendbuff+i*count are sent to rank i.
On non-root ranks, sendbuff is not used.
Each rank receives count elements into recvbuff.
root is the rank that will distribute the data.
In-place operations will happen if recvbuff == sendbuff + root * count.
2023-07-28 16:09:06 -06:00
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Data array to send
@param[in] recvbuff Data array to recv
@param[in] count Number of elements
2023-07-28 16:09:06 -06:00
@param[in] datatype Data buffer element datatype
@param[in] root Rank of scatter root
2023-07-28 16:09:06 -06:00
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2025-09-02 13:21:14 -07:00
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2025-09-02 13:21:14 -07:00
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @brief      All-To-All
    @details    Device (i) sends the (j)th block of its data to device (j), where it is placed
                as the (i)th block. Each block for sending/receiving has *count* elements, which
                means that *recvbuff* and *sendbuff* should have a size of nranks*count elements.
                In-place operation is NOT supported. It is the user's responsibility
                to ensure that sendbuff and recvbuff are distinct.
    @deprecated ncclAllToAll is replaced with ncclAlltoAll and will be removed in the future.
    @return     Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff  Data array to send (contains blocks for each other rank)
    @param[out] recvbuff  Data array to receive (contains blocks from each other rank)
    @param[in]  count     Number of elements to send between each pair of ranks
    @param[in]  datatype  Data buffer element datatype
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
    __attribute__ ((deprecated("ncclAllToAll is replaced with ncclAlltoAll and will be removed in the future")));
/*! @cond include_hidden */
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
/*! @brief      All-To-Allv
    @details    Device (i) sends sendcounts[j] of data from offset sdispls[j]
                to device (j). At the same time, device (i) receives recvcounts[j] of data
                from device (j) to be placed at rdispls[j].
                sendcounts, sdispls, recvcounts and rdispls are all measured in the units
                of datatype, not bytes.
                In-place operation will happen if sendbuff == recvbuff.
    @deprecated ncclAllToAllv is replaced with ncclAlltoAllv and will be removed in the future.
    @return     Result code. See @ref rccl_result_code for more details.
    @param[in]  sendbuff    Data array to send (contains blocks for each other rank)
    @param[in]  sendcounts  Array containing number of elements to send to each participating rank
    @param[in]  sdispls     Array of offsets into *sendbuff* for each participating rank
    @param[out] recvbuff    Data array to receive (contains blocks from each other rank)
    @param[in]  recvcounts  Array containing number of elements to receive from each participating rank
    @param[in]  rdispls     Array of offsets into *recvbuff* for each participating rank
    @param[in]  datatype    Data buffer element datatype
    @param[in]  comm        Communicator group object to execute on
    @param[in]  stream      HIP stream to execute collective on */
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
    __attribute__ ((deprecated("ncclAllToAllv is replaced with ncclAlltoAllv and will be removed in the future")));
/*! @cond include_hidden */
ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
    const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
    const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*! @endcond */
/*! @brief Send
@details Send data from *sendbuff* to rank *peer*.
Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count*
as this rank.
This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
need to progress concurrently to complete, they must be fused within a ncclGroupStart /
ncclGroupEnd section.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendbuff Data array to send
@param[in] count Number of elements to send
@param[in] datatype Data buffer element datatype
@param[in] peer Peer rank to send to
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2020-05-12 14:40:18 -07:00
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
2020-05-12 14:40:18 -07:00
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream);
/*! @endcond */
/*! @brief      Receive
    @details    Receive data from rank *peer* into *recvbuff*.
                Rank *peer* needs to call ncclSend with the same *datatype* and the same *count*
                as this rank.
                This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
                need to progress concurrently to complete, they must be fused within a
                ncclGroupStart / ncclGroupEnd section.
    @return     Result code. See @ref rccl_result_code for more details.
    @param[out] recvbuff  Data array to receive
    @param[in]  count     Number of elements to receive
    @param[in]  datatype  Data buffer element datatype
    @param[in]  peer      Peer rank to receive from
    @param[in]  comm      Communicator group object to execute on
    @param[in]  stream    HIP stream to execute collective on */
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @cond include_hidden */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, hipStream_t stream);
/*! @endcond */
/*! @} */
/*! @defgroup msccl_api MSCCL Algorithm
@details API calls relating to the optional MSCCL algorithm datapath
@{ */
/*! @brief Opaque handle to MSCCL algorithm
    @details Plain integer handle; it is filled in by mscclLoadAlgo and passed by value
             to mscclRunAlgo and mscclUnloadAlgo. */
typedef int mscclAlgoHandle_t;
/*! @brief      MSCCL Load Algorithm
    @deprecated This function has been removed from the public API.
    @details    Load MSCCL algorithm file specified in mscclAlgoFilePath and return
                its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
                scheduler instead of end users.
    @return     Result code. See @ref rccl_result_code for more details.
    @param[in]  mscclAlgoFilePath  Path to MSCCL algorithm file
    @param[out] mscclAlgoHandle    Returned handle to MSCCL algorithm
    @param[in]  rank               Current rank */
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
/*! @cond include_hidden */
ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
/*! @endcond */
/*! @brief MSCCL Run Algorithm
2025-06-11 17:52:09 -06:00
@deprecated This function has been removed from the public API.
2023-07-28 16:09:06 -06:00
@details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
list merges all possible parameters required by different operations as this
is a general-purposed API. This API is expected to be called by MSCCL
scheduler instead of end users.
@return Result code. See @ref rccl_result_code for more details.
@param[in] sendBuff Data array to send
@param[in] sendCounts Array containing number of elements to send to each participating rank
@param[in] sDisPls Array of offsets into *sendbuff* for each participating rank
@param[out] recvBuff Data array to receive
@param[in] recvCounts Array containing number of elements to receive from each participating rank
@param[in] rDisPls Array of offsets into *recvbuff* for each participating rank
@param[in] count Number of elements
@param[in] dataType Data buffer element datatype
@param[in] root Root rank index
@param[in] peer Peer rank index
@param[in] op Reduction operator
@param[in] mscclAlgoHandle Handle to MSCCL algorithm
@param[in] comm Communicator group object to execute on
@param[in] stream HIP stream to execute collective on */
2022-12-13 07:51:04 +08:00
ncclResult_t mscclRunAlgo(
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2022-12-13 07:51:04 +08:00
ncclResult_t pmscclRunAlgo(
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
2023-07-28 16:09:06 -06:00
/*! @endcond */
2022-12-13 07:51:04 +08:00
2023-07-28 16:09:06 -06:00
/*! @brief MSCCL Unload Algorithm
2024-07-04 09:34:38 -06:00
@deprecated This function has been removed from the public API.
2023-07-28 16:09:06 -06:00
@details Unload MSCCL algorithm previous loaded using its handle. This API
is expected to be called by MSCCL scheduler instead of end users.
@return Result code. See @ref rccl_result_code for more details.
@param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload
*/
2022-12-13 07:51:04 +08:00
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2022-12-13 07:51:04 +08:00
ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @} */
/*! @defgroup rccl_group_api Group semantics
@details When managing multiple GPUs from a single thread, and since RCCL collective
calls may perform inter-CPU synchronization, we need to "group" calls for
different ranks/devices into a single call.
Grouping RCCL calls as being part of the same collective operation is done
using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
collective calls until the ncclGroupEnd call, which will wait for all calls
to be complete. Note that for collective communication, ncclGroupEnd only
guarantees that the operations are enqueued on the streams, not that
the operation is effectively done.
Both collective communication and ncclCommInitRank can be used in conjunction
with ncclGroupStart/ncclGroupEnd, but not together.
Group semantics also make it possible to fuse multiple operations on the same
device to improve performance (for aggregated collective calls), or to permit
concurrent progress of multiple send/receive operations.
@{ */
/*! @brief Group Start
@details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into
a single RCCL operation. Nothing will be started on the HIP stream until
ncclGroupEnd.
@return Result code. See @ref rccl_result_code for more details. */
2019-11-19 14:57:39 -08:00
ncclResult_t ncclGroupStart();
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2019-11-19 14:57:39 -08:00
ncclResult_t pncclGroupStart();
2023-07-28 16:09:06 -06:00
/*! @endcond */
2018-09-24 16:06:59 -07:00
2023-07-28 16:09:06 -06:00
/*! @brief Group End
@details End a group call. Start a fused RCCL operation consisting of all calls since
ncclGroupStart. Operations on the HIP stream depending on the RCCL operations
need to be called after ncclGroupEnd.
@return Result code. See @ref rccl_result_code for more details. */
2019-11-19 14:57:39 -08:00
ncclResult_t ncclGroupEnd();
2023-07-28 16:09:06 -06:00
/*! @cond include_hidden */
2019-11-19 14:57:39 -08:00
ncclResult_t pncclGroupEnd();
2023-07-28 16:09:06 -06:00
/*! @endcond */
/*! @} */

/*
 * Group Simulate End
 *
 * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
 *
 * simInfo: struct through which the simulation info is returned
 *          (NOTE(review): presumably caller-allocated -- confirm against the ncclSimInfo_t docs).
 */
ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);

#ifdef __cplusplus
} // end extern "C"
#endif

#ifdef __cplusplus
/* Defined only for C++ translation units, alongside the ncclCommDump declaration below. */
#define NCCL_COMM_DUMP
#include <unordered_map>
#include <string>
/* Dump NCCL current internal state for a given communicator in a key-value store format.
 * Declared outside extern "C" {} because the signature uses a C++ template type. */
ncclResult_t ncclCommDump(ncclComm_t comm, std::unordered_map<std::string, std::string>& map);
#else
#pragma message "NCCL C++ API is disabled because C compiler is being used. Please use a C++ compiler to build NCCL."
#endif

#endif // end include guard