Files
2026-01-20 13:04:02 -06:00

358 sor
14 KiB
C
Executable File

/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_H_
#define NCCL_H_
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
/* Version components of this library release. */
#define NCCL_MAJOR  2
#define NCCL_MINOR  7
#define NCCL_PATCH  0
#define NCCL_SUFFIX ""

/* Packed version number; equals NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH). */
#define NCCL_VERSION_CODE 2700

/* Packs a (major, minor, patch) triple into a single integer so that versions
 * can be compared numerically. */
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))

/* RCCL-specific macros, each defined to 1.
 * NOTE(review): presumably feature-availability flags meant to be tested by
 * client code with #if/#ifdef — confirm. */
#define RCCL_BFLOAT16       1
#define RCCL_BFLOAT8        1
#define RCCL_GATHER_SCATTER 1
#ifdef __cplusplus
extern "C" {
#endif
/* Opaque communicator handle: a pointer to struct ncclComm, whose layout is
 * not defined in this header. */
typedef struct ncclComm* ncclComm_t;

/* Number of bytes stored in an ncclUniqueId. */
#define NCCL_UNIQUE_ID_BYTES 128

/* Fixed-size identifier blob; filled in by ncclGetUniqueId and passed by value
 * to ncclCommInitRank. */
typedef struct {
  char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
/* Error type returned by the API functions declared in this header.
 * ncclSuccess is 0; ncclNumResults equals the number of codes listed before
 * it. */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclNumResults         = 6
} ncclResult_t;
/* Stores the NCCL_VERSION_CODE of the NCCL library in the integer pointed to
 * by version. That integer encodes the MAJOR, MINOR and PATCH level of the
 * library.
 *
 * NOTE(review): the p-prefixed twin has the identical signature; presumably a
 * profiling/interposition entry point — confirm. */
ncclResult_t  ncclGetVersion(int* version);
ncclResult_t pncclGetVersion(int* version);
/* Produces the Id consumed by ncclCommInitRank. Call ncclGetUniqueId once and
 * distribute the resulting Id to every rank of the communicator before
 * ncclCommInitRank is called. */
ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/* Creates a new communicator (multi thread/process version).
 *
 * rank must lie between 0 and nranks-1 and be unique within the communicator
 * clique. Every rank is associated with a CUDA device, and that device has to
 * be set before ncclCommInitRank is called.
 *
 * ncclCommInitRank implicitly synchronizes with the other ranks, so it must
 * either be called from different threads/processes or be wrapped in
 * ncclGroupStart/ncclGroupEnd. */
ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks,
                               ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks,
                               ncclUniqueId commId, int rank);
/* Creates a clique of communicators (single process version).
 *
 * Convenience function for building a single-process communicator clique: it
 * returns an array of ndev newly initialized communicators in comm.
 * comm must be pre-allocated with a size of at least ndev*sizeof(ncclComm_t).
 * When devlist is NULL the first ndev CUDA devices are used; otherwise the
 * order of devlist defines the user-order of processors within the
 * communicator. */
ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/* Releases the resources associated with a communicator object, first waiting
 * for any operations that might still be running on the device. */
ncclResult_t  ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/* Releases the resources associated with a communicator object and aborts any
 * operations that might still be running on the device. */
ncclResult_t  ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
/* Yields a human-readable error message for the given result code. */
const char*  ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
/* Checks whether the communicator comm has encountered any asynchronous
 * errors.
 * NOTE(review): presumably the outcome is written to *asyncError — confirm. */
ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/* Retrieves the number of ranks in the communicator clique. */
ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/* Retrieves the CUDA device number associated with the communicator. */
ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/* Retrieves the user-ordered "rank" associated with the communicator. */
ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/* Selects the reduction operation applied by the reducing collectives.
 * ncclNumOps equals the number of operations listed before it. */
typedef enum {
  ncclSum    = 0,
  ncclProd   = 1,
  ncclMax    = 2,
  ncclMin    = 3,
  ncclNumOps = 4
} ncclRedOp_t;
/* Element data types accepted by the communication calls. Several values have
 * two names (for example ncclInt8 and ncclChar are both 0). ncclNumTypes
 * equals the number of distinct type values listed before it. */
typedef enum {
  ncclInt8       = 0,
  ncclChar       = 0,
  ncclUint8      = 1,
  ncclInt32      = 2,
  ncclInt        = 2,
  ncclUint32     = 3,
  ncclInt64      = 4,
  ncclUint64     = 5,
  ncclFloat16    = 6,
  ncclHalf       = 6,
  ncclFloat32    = 7,
  ncclFloat      = 7,
  ncclFloat64    = 8,
  ncclDouble     = 8,
  ncclBfloat16   = 9,
  ncclFloat8e4m3 = 10,
  ncclFloat8e5m2 = 11,
  ncclNumTypes   = 12
} ncclDataType_t;
/*
* Collective communication operations
*
* Collective communication operations must be called separately for each
* communicator in a communicator clique.
*
* They return when operations have been enqueued on the CUDA stream.
*
* Since they may perform inter-CPU synchronization, each call has to be made
* from a different thread or process, or must use Group Semantics (see
* below).
*/
/*
 * Reduce
 *
 * Combines the data arrays of length count found in sendbuff using the op
 * operation and places the result in recvbuff.
 * recvbuff may be NULL on every call except the one made for the root device.
 * root is a rank (not a CUDA device): the one where the data will reside once
 * the operation is complete.
 *
 * The operation is performed in place when sendbuff == recvbuff.
 */
ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff,
                         size_t count, ncclDataType_t datatype, ncclRedOp_t op,
                         int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff,
                         size_t count, ncclDataType_t datatype, ncclRedOp_t op,
                         int root, ncclComm_t comm, hipStream_t stream);
/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is a rank (not a CUDA device): the one where the data resides before
 * the operation is started.
 *
 * This operation is implicitly in place.
 */
ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype,
                        int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype,
                        int root, ncclComm_t comm, hipStream_t stream);
/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is a rank (not a CUDA device): the one where the data resides before
 * the operation is started.
 *
 * The operation is performed in place when sendbuff == recvbuff.
 */
ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff,
                            size_t count, ncclDataType_t datatype,
                            int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff,
                            size_t count, ncclDataType_t datatype,
                            int root, ncclComm_t comm, hipStream_t stream);
/*
 * All-Reduce
 *
 * Combines the data arrays of length count found in sendbuff using the op
 * operation and leaves an identical copy of the result in each recvbuff.
 *
 * The operation is performed in place when sendbuff == recvbuff.
 */
ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff,
                            size_t count, ncclDataType_t datatype,
                            ncclRedOp_t op, ncclComm_t comm,
                            hipStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff,
                            size_t count, ncclDataType_t datatype,
                            ncclRedOp_t op, ncclComm_t comm,
                            hipStream_t stream);
/*
 * Reduce-Scatter
 *
 * Combines the data in sendbuff using the op operation and leaves the reduced
 * result scattered over the devices, so that recvbuff on rank i holds the
 * i-th block of the result.
 * sendcount is assumed to equal nranks*recvcount; sendbuff therefore needs a
 * size of at least nranks*recvcount elements.
 *
 * The operation is performed in place when
 * recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
                                size_t recvcount, ncclDataType_t datatype,
                                ncclRedOp_t op, ncclComm_t comm,
                                hipStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
                                size_t recvcount, ncclDataType_t datatype,
                                ncclRedOp_t op, ncclComm_t comm,
                                hipStream_t stream);
/*
 * All-Gather
 *
 * Every device gathers sendcount values from the other GPUs into recvbuff;
 * the data coming from rank i lands at offset i*sendcount.
 * recvcount is assumed to equal nranks*sendcount; recvbuff therefore needs a
 * size of at least nranks*sendcount elements.
 *
 * The operation is performed in place when
 * sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff,
                            size_t sendcount, ncclDataType_t datatype,
                            ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff,
                            size_t sendcount, ncclDataType_t datatype,
                            ncclComm_t comm, hipStream_t stream);
/*
 * Send
 *
 * Sends the data in sendbuff to rank peer.
 *
 * Rank peer has to call ncclRecv from this rank with the same datatype and
 * the same count.
 *
 * This operation is blocking for the GPU. When several ncclSend and ncclRecv
 * operations must progress concurrently in order to complete, they have to be
 * fused inside a ncclGroupStart/ncclGroupEnd section.
 */
ncclResult_t  ncclSend(const void* sendbuff, size_t count,
                       ncclDataType_t datatype, int peer,
                       ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count,
                       ncclDataType_t datatype, int peer,
                       ncclComm_t comm, hipStream_t stream);
/*
 * Receive
 *
 * Receives data from rank peer into recvbuff.
 *
 * Rank peer has to call ncclSend to this rank with the same datatype and the
 * same count.
 *
 * This operation is blocking for the GPU. When several ncclSend and ncclRecv
 * operations must progress concurrently in order to complete, they have to be
 * fused inside a ncclGroupStart/ncclGroupEnd section.
 */
ncclResult_t  ncclRecv(void* recvbuff, size_t count,
                       ncclDataType_t datatype, int peer,
                       ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclRecv(void* recvbuff, size_t count,
                       ncclDataType_t datatype, int peer,
                       ncclComm_t comm, hipStream_t stream);
/*
 * Gather
 *
 * The root device gathers sendcount values from the other GPUs into recvbuff;
 * the data coming from rank i lands at offset i*sendcount.
 * recvcount is assumed to equal nranks*sendcount; recvbuff therefore needs a
 * size of at least nranks*sendcount elements.
 *
 * The operation is performed in place when
 * sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t  ncclGather(const void* sendbuff, void* recvbuff,
                         size_t sendcount, ncclDataType_t datatype,
                         int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff,
                         size_t sendcount, ncclDataType_t datatype,
                         int root, ncclComm_t comm, hipStream_t stream);
/*
 * Scatter
 *
 * The data on root is scattered over the devices, so that recvbuff on rank i
 * holds the i-th block of the data on root.
 * sendcount is assumed to equal nranks*recvcount; sendbuff therefore needs a
 * size of at least nranks*recvcount elements.
 *
 * The operation is performed in place when
 * recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t  ncclScatter(const void* sendbuff, void* recvbuff,
                          size_t recvcount, ncclDataType_t datatype,
                          int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff,
                          size_t recvcount, ncclDataType_t datatype,
                          int root, ncclComm_t comm, hipStream_t stream);
/*
 * All-To-All
 *
 * Device i sends its j-th block of data to device j, where it is placed as
 * the i-th block. Every block sent or received has count elements, so
 * recvbuff and sendbuff should each have a size of nranks*count elements.
 *
 * The operation is performed in place when sendbuff == recvbuff.
 */
ncclResult_t  ncclAlltoAll(const void* sendbuff, void* recvbuff,
                           size_t count, ncclDataType_t datatype,
                           ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclAlltoAll(const void* sendbuff, void* recvbuff,
                           size_t count, ncclDataType_t datatype,
                           ncclComm_t comm, hipStream_t stream);
/*
* Group semantics
*
* When managing multiple GPUs from a single thread, and since NCCL collective
* calls may perform inter-CPU synchronization, we need to "group" calls for
* different ranks/devices into a single call.
*
* Grouping NCCL calls as being part of the same collective operation is done
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
* collective calls until the ncclGroupEnd call, which will wait for all calls
* to be complete. Note that for collective communication, ncclGroupEnd only
* guarantees that the operations are enqueued on the streams, not that
* the operation is effectively done.
*
* Both collective communication and ncclCommInitRank can be used in conjunction
* with ncclGroupStart/ncclGroupEnd, but not together.
*
* Group semantics also allow fusing multiple operations on the same device
* to improve performance (for aggregated collective calls), or permitting
* concurrent progress of multiple send/receive operations.
*/
/*
 * Group Start
 *
 * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 *
 * Takes no arguments. The parameter list is spelled (void) so that C
 * compilers see a real prototype and reject calls that pass arguments; an
 * empty () in C (before C23) leaves the parameters unspecified. The meaning
 * in C++ is unchanged.
 */
ncclResult_t  ncclGroupStart(void);
ncclResult_t pncclGroupStart(void);
/*
 * Group End
 *
 * End a group call. Start a fused NCCL operation consisting of all calls since
 * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
 * need to be called after ncclGroupEnd.
 *
 * Takes no arguments. The parameter list is spelled (void) so that C
 * compilers see a real prototype and reject calls that pass arguments; an
 * empty () in C (before C23) leaves the parameters unspecified. The meaning
 * in C++ is unchanged.
 */
ncclResult_t  ncclGroupEnd(void);
ncclResult_t pncclGroupEnd(void);
#ifdef __cplusplus
} // end extern "C"
#endif
#endif // end include guard