topo_expl: update for 2.11.4 (#490)
* topo_expl: update for 2.11.4
* topo_expl: revert a few logging changes
[ROCm/rccl commit: 369c021992]
Dieser Commit ist enthalten in:
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,30 +8,32 @@
|
||||
#ifndef NCCL_H_
|
||||
#define NCCL_H_
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
#define NCCL_MAJOR 2
|
||||
#define NCCL_MINOR 6
|
||||
#define NCCL_PATCH 2
|
||||
#define NCCL_MINOR 11
|
||||
#define NCCL_PATCH 4
|
||||
#define NCCL_SUFFIX ""
|
||||
|
||||
#define NCCL_VERSION_CODE 2602
|
||||
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
|
||||
#define NCCL_VERSION_CODE 21104
|
||||
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
|
||||
|
||||
#define RCCL_BFLOAT16 1
|
||||
#define RCCL_GATHER_SCATTER 1
|
||||
#define RCCL_ALLTOALLV 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Opaque handle to communicator */
|
||||
/*! @brief Opaque handle to communicator */
|
||||
typedef struct ncclComm* ncclComm_t;
|
||||
|
||||
#define NCCL_UNIQUE_ID_BYTES 128
|
||||
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
|
||||
|
||||
/* Error type */
|
||||
/*! @brief Error type */
|
||||
typedef enum { ncclSuccess = 0,
|
||||
ncclUnhandledCudaError = 1,
|
||||
ncclSystemError = 2,
|
||||
@@ -40,75 +42,125 @@ typedef enum { ncclSuccess = 0,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclNumResults = 6 } ncclResult_t;
|
||||
|
||||
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
* This integer is coded with the MAJOR, MINOR and PATCH level of the
|
||||
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
*
|
||||
* @details This integer is coded with the MAJOR, MINOR and PATCH level of the
|
||||
* NCCL library
|
||||
*/
|
||||
ncclResult_t ncclGetVersion(int *version);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetVersion(int *version);
|
||||
/// @endcond
|
||||
|
||||
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
|
||||
* called once and the Id should be distributed to all ranks in the
|
||||
* communicator before calling ncclCommInitRank. */
|
||||
/*! @brief Generates an ID for ncclCommInitRank
|
||||
|
||||
@details
|
||||
Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be
|
||||
called once and the Id should be distributed to all ranks in the
|
||||
communicator before calling ncclCommInitRank.
|
||||
|
||||
@param[in]
|
||||
uniqueId ncclUniqueId*
|
||||
pointer to uniqueId
|
||||
|
||||
*/
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @endcond
|
||||
|
||||
/* Creates a new communicator (multi thread/process version).
|
||||
* rank must be between 0 and nranks-1 and unique within a communicator clique.
|
||||
* Each rank is associated to a CUDA device, which has to be set before calling
|
||||
* ncclCommInitRank.
|
||||
* ncclCommInitRank implicitly syncronizes with other ranks, so it must be
|
||||
* called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
|
||||
/*! @brief Creates a new communicator (multi thread/process version).
|
||||
|
||||
@details
|
||||
rank must be between 0 and nranks-1 and unique within a communicator clique.
|
||||
Each rank is associated to a CUDA device, which has to be set before calling
|
||||
ncclCommInitRank.
|
||||
ncclCommInitRank implicitly syncronizes with other ranks, so it must be
|
||||
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
|
||||
|
||||
@param[in]
|
||||
comm ncclComm_t*
|
||||
communicator struct pointer
|
||||
*/
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
||||
/// @endcond
|
||||
|
||||
/* Creates a clique of communicators (single process version).
|
||||
* This is a convenience function to create a single-process communicator clique.
|
||||
/*! @brief Creates a clique of communicators (single process version).
|
||||
*
|
||||
* @details This is a convenience function to create a single-process communicator clique.
|
||||
* Returns an array of ndev newly initialized communicators in comm.
|
||||
* comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
|
||||
* If devlist is NULL, the first ndev CUDA devices are used.
|
||||
* Order of devlist defines user-order of processors within the communicator. */
|
||||
* If devlist is NULL, the first ndev HIP devices are used.
|
||||
* Order of devlist defines user-order of processors within the communicator.
|
||||
* */
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @endcond
|
||||
|
||||
/* Frees resources associated with communicator object, but waits for any operations
|
||||
* that might still be running on the device. */
|
||||
/*! @brief Frees resources associated with communicator object, but waits for any operations that might still be running on the device */
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommDestroy(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/* Frees resources associated with communicator object and aborts any operations
|
||||
* that might still be running on the device. */
|
||||
/*! @brief Frees resources associated with communicator object and aborts any operations that might still be running on the device. */
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/* Returns a human-readable error message. */
|
||||
/*! @brief Returns a human-readable error message. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
|
||||
/* Checks whether the comm has encountered any asynchronous errors */
|
||||
/*! @brief Checks whether the comm has encountered any asynchronous errors */
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
/// @endcond
|
||||
|
||||
/* Gets the number of ranks in the communicator clique. */
|
||||
/*! @brief Gets the number of ranks in the communicator clique. */
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @endcond
|
||||
|
||||
/* Returns the cuda device number associated with the communicator. */
|
||||
/*! @brief Returns the rocm device number associated with the communicator. */
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @endcond
|
||||
|
||||
/* Returns the user-ordered "rank" associated with the communicator. */
|
||||
/*! @brief Returns the user-ordered "rank" associated with the communicator. */
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Reduction operation selector */
|
||||
/* Reduction operation selector */
|
||||
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
|
||||
typedef enum { ncclSum = 0,
|
||||
ncclProd = 1,
|
||||
ncclMax = 2,
|
||||
ncclMin = 3,
|
||||
ncclNumOps = 4 } ncclRedOp_t;
|
||||
ncclAvg = 4,
|
||||
/* ncclNumOps: The number of built-in ncclRedOp_t values. Also
|
||||
* serves as the least possible value for dynamic ncclRedOp_t's
|
||||
* as constructed by ncclRedOpCreate*** functions. */
|
||||
ncclNumOps = 5,
|
||||
/* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
|
||||
* It is defined to be the largest signed value (since compilers
|
||||
* are permitted to use signed enums) that won't grow
|
||||
* sizeof(ncclRedOp_t) when compared to previous NCCL versions to
|
||||
* maintain ABI compatibility. */
|
||||
ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
|
||||
} ncclRedOp_t;
|
||||
|
||||
/* Data types */
|
||||
/*! @brief Data types */
|
||||
typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
ncclUint8 = 1,
|
||||
ncclInt32 = 2, ncclInt = 2,
|
||||
@@ -121,6 +173,40 @@ typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
ncclBfloat16 = 9,
|
||||
ncclNumTypes = 10 } ncclDataType_t;
|
||||
|
||||
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
|
||||
typedef enum {
|
||||
/* ncclScalarDevice: The scalar is in device-visible memory and will be
|
||||
* dereferenced while the collective is running. */
|
||||
ncclScalarDevice = 0,
|
||||
|
||||
/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
|
||||
* dereferenced before the ncclRedOpCreate***() function returns. */
|
||||
ncclScalarHostImmediate = 1
|
||||
} ncclScalarResidence_t;
|
||||
|
||||
/*
|
||||
* ncclRedOpCreatePreMulSum
|
||||
*
|
||||
* Creates a new reduction operator which pre-multiplies input values by a given
|
||||
* scalar locally before reducing them with peer values via summation. For use
|
||||
* only with collectives launched against *comm* and *datatype*. The
|
||||
* *residence* argument indicates how/when the memory pointed to by *scalar*
|
||||
* will be dereferenced. Upon return, the newly created operator's handle
|
||||
* is stored in *op*.
|
||||
*/
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
|
||||
/*
|
||||
* ncclRedOpDestroy
|
||||
*
|
||||
* Destroys the reduction operator *op*. The operator must have been created by
|
||||
* ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
|
||||
* destroyed as soon as the last NCCL function which is given that operator returns.
|
||||
*/
|
||||
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
|
||||
/*
|
||||
* Collective communication operations
|
||||
*
|
||||
@@ -134,10 +220,10 @@ typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
* below).
|
||||
*/
|
||||
|
||||
/*
|
||||
* Reduce
|
||||
/*!
|
||||
* @brief Reduce
|
||||
*
|
||||
* Reduces data arrays of length count in sendbuff into recvbuff using op
|
||||
* @details Reduces data arrays of length count in sendbuff into recvbuff using op
|
||||
* operation.
|
||||
* recvbuff may be NULL on all calls except for root device.
|
||||
* root is the rank (not the CUDA device) where data will reside after the
|
||||
@@ -147,13 +233,14 @@ typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
*/
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* (deprecated) Broadcast (in-place)
|
||||
/*! @brief (deprecated) Broadcast (in-place)
|
||||
*
|
||||
* Copies count values from root to all other devices.
|
||||
* @details Copies count values from root to all other devices.
|
||||
* root is the rank (not the CUDA device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
@@ -161,40 +248,44 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc
|
||||
*/
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Broadcast
|
||||
/*! @brief Broadcast
|
||||
*
|
||||
* Copies count values from root to all other devices.
|
||||
* root is the rank (not the CUDA device) where data resides before the
|
||||
* @details Copies count values from root to all other devices.
|
||||
* root is the rank (not the HIP device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* All-Reduce
|
||||
/*! @brief All-Reduce
|
||||
*
|
||||
* Reduces data arrays of length count in sendbuff using op operation, and
|
||||
* @details Reduces data arrays of length count in sendbuff using op operation, and
|
||||
* leaves identical copies of result on each recvbuff.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Reduce-Scatter
|
||||
/*!
|
||||
* @brief Reduce-Scatter
|
||||
*
|
||||
* Reduces data in sendbuff using op operation and leaves reduced result
|
||||
* @details Reduces data in sendbuff using op operation and leaves reduced result
|
||||
* scattered over the devices so that recvbuff on rank i will contain the i-th
|
||||
* block of the result.
|
||||
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
||||
@@ -205,14 +296,15 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* All-Gather
|
||||
/*! @brief All-Gather
|
||||
*
|
||||
* Each device gathers sendcount values from other GPUs into recvbuff,
|
||||
* @details Each device gathers sendcount values from other GPUs into recvbuff,
|
||||
* receiving data from rank i at offset i*sendcount.
|
||||
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
||||
* should have a size of at least nranks*sendcount elements.
|
||||
@@ -221,8 +313,115 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
*/
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Send
|
||||
*
|
||||
* @details Send data from sendbuff to rank peer.
|
||||
* Rank peer needs to call ncclRecv with the same datatype and the same count from this
|
||||
* rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
|
||||
* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
|
||||
* ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Receive
|
||||
*
|
||||
* @details Receive data from rank peer into recvbuff.
|
||||
* Rank peer needs to call ncclSend with the same datatype and the same count to this
|
||||
* rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
|
||||
* need to progress concurrently to complete, they must be fused within a ncclGroupStart/
|
||||
* ncclGroupEnd section.
|
||||
*/
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Gather
|
||||
*
|
||||
* @details Root device gathers sendcount values from other GPUs into recvbuff,
|
||||
* receiving data from rank i at offset i*sendcount.
|
||||
*
|
||||
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
||||
* should have a size of at least nranks*sendcount elements.
|
||||
*
|
||||
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
|
||||
*/
|
||||
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Scatter
|
||||
*
|
||||
* @details Scattered over the devices so that recvbuff on rank i will contain the i-th
|
||||
* block of the data on root.
|
||||
*
|
||||
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
||||
* should have a size of at least nranks*recvcount elements.
|
||||
*
|
||||
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
|
||||
*/
|
||||
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-To-All
|
||||
*
|
||||
* @details Device (i) send (j)th block of data to device (j) and be placed as (i)th
|
||||
* block. Each block for sending/receiving has count elements, which means
|
||||
* that recvbuff and sendbuff should have a size of nranks*count elements.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-To-Allv
|
||||
*
|
||||
* @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
|
||||
* to device (j). In the same time, device (i) receives recvcounts[j] of data
|
||||
* from device (j) to be placed at rdispls[j].
|
||||
|
||||
* sendcounts, sdispls, recvcounts and rdispls are all measured in the units
|
||||
* of datatype, not bytes.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
|
||||
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void *recvbuff, const size_t recvcounts[],
|
||||
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Group semantics
|
||||
@@ -239,26 +438,34 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
|
||||
* the operation is effectively done.
|
||||
*
|
||||
* Both collective communication and ncclCommInitRank can be used in conjunction
|
||||
* of ncclGroupStart/ncclGroupEnd.
|
||||
* of ncclGroupStart/ncclGroupEnd, but not together.
|
||||
*
|
||||
* Group semantics also allow to fuse multiple operations on the same device
|
||||
* to improve performance (for aggregated collective calls), or to permit
|
||||
* concurrent progress of multiple send/receive operations.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Group Start
|
||||
/*! @brief Group Start
|
||||
*
|
||||
* Start a group call. All subsequent calls to NCCL may not block due to
|
||||
* inter-CPU synchronization.
|
||||
* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
|
||||
* a single NCCL operation. Nothing will be started on the CUDA stream until
|
||||
* ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupStart();
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupStart();
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Group End
|
||||
/*! @brief Group End
|
||||
*
|
||||
* End a group call. Wait for all calls since ncclGroupStart to complete
|
||||
* before returning.
|
||||
* End a group call. Start a fused NCCL operation consisting of all calls since
|
||||
* ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
|
||||
* need to be called after ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupEnd();
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupEnd();
|
||||
/// @endcond
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
|
||||
@@ -209,6 +209,7 @@ int main(int argc,char* argv[])
|
||||
// Mark channels as non initialized.
|
||||
for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
|
||||
NCCLCHECK(ncclCalloc((uint32_t**)&comm[i].p2pNet, 1));
|
||||
NCCLCHECK(ncclCalloc(&comm[i].rankToIntraNodeRank, comm->nRanks));
|
||||
}
|
||||
|
||||
struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph;
|
||||
|
||||
@@ -200,18 +200,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 0;
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
int g1, g2;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, info1->rank, &g1));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, info2->rank, &g2));
|
||||
if (system->nodes[GPU].nodes[g1].paths[GPU][g2].type == PATH_NVL) *ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) {
|
||||
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) {
|
||||
struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank;
|
||||
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
|
||||
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
|
||||
@@ -224,8 +214,8 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 0 : 1, &n2));
|
||||
}
|
||||
|
||||
int xgmi;
|
||||
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, myInfo, peerInfo));
|
||||
bool xgmi;
|
||||
NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
|
||||
if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
|
||||
@@ -236,6 +226,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
if (ret) {
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
|
||||
if (transportType) *transportType = t;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
@@ -259,10 +250,13 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel*
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) {
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
||||
//hipStream_t transportSetupStream;
|
||||
//CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
|
||||
hipStream_t transportSetupStream;
|
||||
CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
|
||||
#endif
|
||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||
|
||||
struct ncclConnect data[2*MAXCHANNELS];
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
@@ -274,15 +268,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
|
||||
struct ncclConnect* recvData = data;
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
int type;
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (recvMask & (1<<c)) {
|
||||
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex));
|
||||
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
|
||||
if (type > highestType) highestType = type;
|
||||
}
|
||||
}
|
||||
struct ncclConnect* sendData = recvData+recvChannels;
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (sendMask & (1<<c)) {
|
||||
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex));
|
||||
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type));
|
||||
if (type > highestType) highestType = type;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -305,7 +302,11 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
|
||||
//NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
#if CUDART_VERSION >= 11030
|
||||
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
|
||||
#else
|
||||
//CUDACHECK(hipMemcpy(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
@@ -313,13 +314,20 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
|
||||
//NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
#if CUDART_VERSION >= 11030
|
||||
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
|
||||
#else
|
||||
//CUDACHECK(hipMemcpy(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
|
||||
comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
|
||||
}
|
||||
//CUDACHECK(hipStreamSynchronize(transportSetupStream));
|
||||
//CUDACHECK(hipStreamDestroy(transportSetupStream));
|
||||
#if CUDART_VERSION >= 11030
|
||||
CUDACHECK(hipStreamSynchronize(transportSetupStream));
|
||||
CUDACHECK(hipStreamDestroy(transportSetupStream));
|
||||
#endif
|
||||
if (highestTransportType != NULL) *highestTransportType = highestType;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -413,22 +421,18 @@ cleanup:
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
// AllGather collNet setup results
|
||||
int* allGatherFailures;
|
||||
NCCLCHECK(ncclCalloc(&allGatherFailures, nranks));
|
||||
allGatherFailures[rank] = collNetSetupFail;
|
||||
//NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int)));
|
||||
for (int i=0; i<nranks; i++) {
|
||||
int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0};
|
||||
allGatherFailures[comm->intraNodeRank] = collNetSetupFail;
|
||||
//NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int)));
|
||||
for (int i=0; i<comm->localRanks; i++) {
|
||||
if (allGatherFailures[i] != 0) {
|
||||
collNetSetupFail = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(allGatherFailures);
|
||||
if (collNetSetupFail) {
|
||||
if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
|
||||
if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -489,13 +493,14 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
|
||||
int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0;
|
||||
int myCompCap = allGather1Data[rank].cudaCompCap;
|
||||
int minCompCap = myCompCap, maxCompCap = myCompCap;
|
||||
int intraNodeGlobalRanks[256];
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
|
||||
// Rank is on same node
|
||||
if (intraNodeRanks == 0) intraNodeRank0 = i;
|
||||
if (i == rank) intraNodeRank = intraNodeRanks;
|
||||
intraNodeGlobalRanks[intraNodeRanks++] = i;
|
||||
comm->intraNodeGlobalRanks[intraNodeRanks] = i;
|
||||
comm->rankToIntraNodeRank[i] = intraNodeRanks;
|
||||
intraNodeRanks++;
|
||||
if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
|
||||
// Rank is in same process
|
||||
if (intraProcRanks == 0) intraProcRank0 = i;
|
||||
@@ -524,6 +529,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
|
||||
}
|
||||
struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm;
|
||||
uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash;
|
||||
comm->intraNodeRank = intraNodeRank;
|
||||
|
||||
// AllGather1 - end
|
||||
|
||||
@@ -846,6 +852,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
// Check if we can setup CollNet
|
||||
if (comm->collNetSupport > 0) {
|
||||
int collNetSetupFail = 0;
|
||||
int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P};
|
||||
// Find all head ranks
|
||||
int nHeads = collNetGraph.nChannels;
|
||||
int *heads;
|
||||
@@ -871,16 +878,26 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
|
||||
|
||||
// Connect intra-node CollNet
|
||||
int highestTransportType0, highestTransportType1;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
|
||||
|
||||
// Exchange highest intra-node transport type among ranks
|
||||
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
|
||||
comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
|
||||
//NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int)));
|
||||
//for (int i=0; i<comm->localRanks; i++) {
|
||||
//if (highestTypes[i] > comm->intraHighestTransportType)
|
||||
//comm->intraHighestTransportType = highestTypes[i];
|
||||
//}
|
||||
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
|
||||
|
||||
collnet_cleanup:
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren