From 02a94fc552fbedf23b90786c9a8f1d9e4e7be73a Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Thu, 13 Jan 2022 13:33:07 -0800 Subject: [PATCH] topo_expl: update for 2.11.4 (#490) * topo_expl: update for 2.11.4 * topo_expl: revert a few logging changes [ROCm/rccl commit: 369c021992cd8239b8c826fc9e9b1ab849b68339] --- projects/rccl/tools/topo_expl/include/nccl.h | 333 +++++++++++++++---- projects/rccl/tools/topo_expl/topo_expl.cpp | 1 + projects/rccl/tools/topo_expl/utils.cpp | 85 +++-- 3 files changed, 322 insertions(+), 97 deletions(-) diff --git a/projects/rccl/tools/topo_expl/include/nccl.h b/projects/rccl/tools/topo_expl/include/nccl.h index 225b150127..fc8861d2b8 100644 --- a/projects/rccl/tools/topo_expl/include/nccl.h +++ b/projects/rccl/tools/topo_expl/include/nccl.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,30 +8,32 @@ #ifndef NCCL_H_ #define NCCL_H_ -#include +#include #include #define NCCL_MAJOR 2 -#define NCCL_MINOR 6 -#define NCCL_PATCH 2 +#define NCCL_MINOR 11 +#define NCCL_PATCH 4 #define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE 2602 -#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) +#define NCCL_VERSION_CODE 21104 +#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #define RCCL_BFLOAT16 1 +#define RCCL_GATHER_SCATTER 1 +#define RCCL_ALLTOALLV 1 #ifdef __cplusplus extern "C" { #endif -/* Opaque handle to communicator */ +/*! @brief Opaque handle to communicator */ typedef struct ncclComm* ncclComm_t; #define NCCL_UNIQUE_ID_BYTES 128 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; -/* Error type */ +/*! @brief Error type */ typedef enum { ncclSuccess = 0, ncclUnhandledCudaError = 1, ncclSystemError = 2, @@ -40,75 +42,125 @@ typedef enum { ncclSuccess = 0, ncclInvalidUsage = 5, ncclNumResults = 6 } ncclResult_t; -/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. - * This integer is coded with the MAJOR, MINOR and PATCH level of the +/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. + * + * @details This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library */ ncclResult_t ncclGetVersion(int *version); +/// @cond include_hidden ncclResult_t pncclGetVersion(int *version); +/// @endcond -/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be - * called once and the Id should be distributed to all ranks in the - * communicator before calling ncclCommInitRank. */ +/*! @brief Generates an ID for ncclCommInitRank + + @details + Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be + called once and the Id should be distributed to all ranks in the + communicator before calling ncclCommInitRank. + + @param[in] + uniqueId ncclUniqueId* + pointer to uniqueId + +*/ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +/// @cond include_hidden ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); +/// @endcond -/* Creates a new communicator (multi thread/process version). - * rank must be between 0 and nranks-1 and unique within a communicator clique. - * Each rank is associated to a CUDA device, which has to be set before calling - * ncclCommInitRank. - * ncclCommInitRank implicitly syncronizes with other ranks, so it must be - * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ +/*! @brief Creates a new communicator (multi thread/process version). + + @details + rank must be between 0 and nranks-1 and unique within a communicator clique. + Each rank is associated to a CUDA device, which has to be set before calling + ncclCommInitRank. + ncclCommInitRank implicitly syncronizes with other ranks, so it must be + called by different threads/processes or use ncclGroupStart/ncclGroupEnd. + + @param[in] + comm ncclComm_t* + communicator struct pointer + */ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +/// @cond include_hidden ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +/// @endcond -/* Creates a clique of communicators (single process version). - * This is a convenience function to create a single-process communicator clique. +/*! @brief Creates a clique of communicators (single process version). + * + * @details This is a convenience function to create a single-process communicator clique. * Returns an array of ndev newly initialized communicators in comm. * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). - * If devlist is NULL, the first ndev CUDA devices are used. - * Order of devlist defines user-order of processors within the communicator. */ + * If devlist is NULL, the first ndev HIP devices are used. + * Order of devlist defines user-order of processors within the communicator. + * */ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +/// @cond include_hidden ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +/// @endcond -/* Frees resources associated with communicator object, but waits for any operations - * that might still be running on the device. */ + /*! @brief Frees resources associated with communicator object, but waits for any operations that might still be running on the device */ ncclResult_t ncclCommDestroy(ncclComm_t comm); +/// @cond include_hidden ncclResult_t pncclCommDestroy(ncclComm_t comm); +/// @endcond -/* Frees resources associated with communicator object and aborts any operations - * that might still be running on the device. */ +/*! @brief Frees resources associated with communicator object and aborts any operations that might still be running on the device. */ ncclResult_t ncclCommAbort(ncclComm_t comm); +/// @cond include_hidden ncclResult_t pncclCommAbort(ncclComm_t comm); +/// @endcond -/* Returns a human-readable error message. */ +/*! @brief Returns a human-readable error message. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); -/* Checks whether the comm has encountered any asynchronous errors */ +/*! @brief Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +/// @cond include_hidden ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +/// @endcond -/* Gets the number of ranks in the communicator clique. */ +/*! @brief Gets the number of ranks in the communicator clique. */ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +/// @cond include_hidden ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); +/// @endcond -/* Returns the cuda device number associated with the communicator. */ +/*! @brief Returns the rocm device number associated with the communicator. */ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +/// @cond include_hidden ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); +/// @endcond -/* Returns the user-ordered "rank" associated with the communicator. */ +/*! @brief Returns the user-ordered "rank" associated with the communicator. */ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +/// @cond include_hidden ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); +/// @endcond +/*! @brief Reduction operation selector */ /* Reduction operation selector */ +typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; typedef enum { ncclSum = 0, ncclProd = 1, ncclMax = 2, ncclMin = 3, - ncclNumOps = 4 } ncclRedOp_t; + ncclAvg = 4, + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. */ + ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) + } ncclRedOp_t; -/* Data types */ +/*! @brief Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclUint8 = 1, ncclInt32 = 2, ncclInt = 2, @@ -121,6 +173,40 @@ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclBfloat16 = 9, ncclNumTypes = 10 } ncclDataType_t; +/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ +typedef enum { + /* ncclScalarDevice: The scalar is in device-visible memory and will be + * dereferenced while the collective is running. */ + ncclScalarDevice = 0, + + /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be + * dereferenced before the ncclRedOpCreate***() function returns. */ + ncclScalarHostImmediate = 1 +} ncclScalarResidence_t; + +/* + * ncclRedOpCreatePreMulSum + * + * Creates a new reduction operator which pre-multiplies input values by a given + * scalar locally before reducing them with peer values via summation. For use + * only with collectives launched against *comm* and *datatype*. The + * *residence* argument indicates how/when the memory pointed to by *scalar* + * will be dereferenced. Upon return, the newly created operator's handle + * is stored in *op*. + */ +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); + +/* + * ncclRedOpDestroy + * + * Destroys the reduction operator *op*. The operator must have been created by + * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be + * destroyed as soon as the last NCCL function which is given that operator returns. + */ +ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); +ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + /* * Collective communication operations * @@ -134,10 +220,10 @@ typedef enum { ncclInt8 = 0, ncclChar = 0, * below). */ -/* - * Reduce +/*! + * @brief Reduce * - * Reduces data arrays of length count in sendbuff into recvbuff using op + * @details Reduces data arrays of length count in sendbuff into recvbuff using op * operation. * recvbuff may be NULL on all calls except for root device. * root is the rank (not the CUDA device) where data will reside after the @@ -147,13 +233,14 @@ typedef enum { ncclInt8 = 0, ncclChar = 0, */ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); +/// @endcond -/* - * (deprecated) Broadcast (in-place) +/*! @brief (deprecated) Broadcast (in-place) * - * Copies count values from root to all other devices. + * @details Copies count values from root to all other devices. * root is the rank (not the CUDA device) where data resides before the * operation is started. * @@ -161,40 +248,44 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc */ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); +/// @endcond -/* - * Broadcast +/*! @brief Broadcast * - * Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the + * @details Copies count values from root to all other devices. + * root is the rank (not the HIP device) where data resides before the * operation is started. * * In-place operation will happen if sendbuff == recvbuff. */ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); +/// @endcond -/* - * All-Reduce +/*! @brief All-Reduce * - * Reduces data arrays of length count in sendbuff using op operation, and + * @details Reduces data arrays of length count in sendbuff using op operation, and * leaves identical copies of result on each recvbuff. * * In-place operation will happen if sendbuff == recvbuff. */ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); +/// @endcond -/* - * Reduce-Scatter +/*! + * @brief Reduce-Scatter * - * Reduces data in sendbuff using op operation and leaves reduced result + * @details Reduces data in sendbuff using op operation and leaves reduced result * scattered over the devices so that recvbuff on rank i will contain the i-th * block of the result. * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff @@ -205,14 +296,15 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); +/// @endcond -/* - * All-Gather +/*! @brief All-Gather * - * Each device gathers sendcount values from other GPUs into recvbuff, + * @details Each device gathers sendcount values from other GPUs into recvbuff, * receiving data from rank i at offset i*sendcount. * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff * should have a size of at least nranks*sendcount elements. @@ -221,8 +313,115 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, */ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +/// @endcond + +/*! @brief Send + * + * @details Send data from sendbuff to rank peer. + * Rank peer needs to call ncclRecv with the same datatype and the same count from this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. + */ +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden +ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, hipStream_t stream); +/// @endcond + +/*! @brief Receive + * + * @details Receive data from rank peer into recvbuff. + * Rank peer needs to call ncclSend with the same datatype and the same count to this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. + */ +/// @cond include_hidden +ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, hipStream_t stream); +ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, hipStream_t stream); +/// @endcond + +/*! @brief Gather + * + * @details Root device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ +ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden +ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); +/// @endcond + +/*! @brief Scatter + * + * @details Scattered over the devices so that recvbuff on rank i will contain the i-th + * block of the data on root. + * + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ +ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); +/// @cond include_hidden +ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); +/// @endcond + +/*! @brief All-To-All + * + * @details Device (i) send (j)th block of data to device (j) and be placed as (i)th + * block. Each block for sending/receiving has count elements, which means + * that recvbuff and sendbuff should have a size of nranks*count elements. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden +ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +/// @endcond + +/*! @brief All-To-Allv + * + * @details Device (i) sends sendcounts[j] of data from offset sdispls[j] + * to device (j). In the same time, device (i) receives recvcounts[j] of data + * from device (j) to be placed at rdispls[j]. + + * sendcounts, sdispls, recvcounts and rdispls are all measured in the units + * of datatype, not bytes. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], + const size_t sdispls[], void *recvbuff, const size_t recvcounts[], + const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +/// @cond include_hidden +ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[], + const size_t sdispls[], void *recvbuff, const size_t recvcounts[], + const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +/// @endcond /* * Group semantics @@ -239,26 +438,34 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou * the operation is effectively done. * * Both collective communication and ncclCommInitRank can be used in conjunction - * of ncclGroupStart/ncclGroupEnd. + * of ncclGroupStart/ncclGroupEnd, but not together. + * + * Group semantics also allow to fuse multiple operations on the same device + * to improve performance (for aggregated collective calls), or to permit + * concurrent progress of multiple send/receive operations. */ -/* - * Group Start +/*! @brief Group Start * - * Start a group call. All subsequent calls to NCCL may not block due to - * inter-CPU synchronization. + * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into + * a single NCCL operation. Nothing will be started on the CUDA stream until + * ncclGroupEnd. */ ncclResult_t ncclGroupStart(); +/// @cond include_hidden ncclResult_t pncclGroupStart(); +/// @endcond -/* - * Group End +/*! @brief Group End * - * End a group call. Wait for all calls since ncclGroupStart to complete - * before returning. + * End a group call. Start a fused NCCL operation consisting of all calls since + * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations + * need to be called after ncclGroupEnd. */ ncclResult_t ncclGroupEnd(); +/// @cond include_hidden ncclResult_t pncclGroupEnd(); +/// @endcond #ifdef __cplusplus } // end extern "C" diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp index 4aa8dcb477..e051cae38c 100644 --- a/projects/rccl/tools/topo_expl/topo_expl.cpp +++ b/projects/rccl/tools/topo_expl/topo_expl.cpp @@ -209,6 +209,7 @@ int main(int argc,char* argv[]) // Mark channels as non initialized. for (int c=0; cnRanks)); } struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph; diff --git a/projects/rccl/tools/topo_expl/utils.cpp b/projects/rccl/tools/topo_expl/utils.cpp index 5ad3a24521..eb7d1e331c 100644 --- a/projects/rccl/tools/topo_expl/utils.cpp +++ b/projects/rccl/tools/topo_expl/utils.cpp @@ -200,18 +200,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, return ncclSuccess; } -static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { - *ret = 0; - if (info1->hostHash != info2->hostHash) return ncclSuccess; - int g1, g2; - NCCLCHECK(ncclTopoRankToIndex(system, info1->rank, &g1)); - NCCLCHECK(ncclTopoRankToIndex(system, info2->rank, &g2)); - if (system->nodes[GPU].nodes[g1].paths[GPU][g2].type == PATH_NVL) *ret = 1; - return ncclSuccess; -} - template -static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) { +static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) { struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : @@ -224,8 +214,8 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 0 : 1, &n2)); } - int xgmi; - NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, myInfo, peerInfo)); + bool xgmi; + NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi)); for (int t=0; t= 0 && n2 >= 0 && t != TRANSPORT_NET) continue; @@ -236,6 +226,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* if (ret) { connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); + if (transportType) *transportType = t; return ncclSuccess; } } @@ -259,10 +250,13 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* return ncclSuccess; } -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) { +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { +#if CUDART_VERSION >= 11030 // Stream used during transport setup; need for P2P pre-connect + CUDA Graph - //hipStream_t transportSetupStream; - //CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking)); + hipStream_t transportSetupStream; + CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking)); +#endif + int highestType = TRANSPORT_P2P; // track highest transport type struct ncclConnect data[2*MAXCHANNELS]; for (int i=1; inRanks; i++) { @@ -274,15 +268,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnect* recvData = data; int sendChannels = 0, recvChannels = 0; + int type; for (int c=0; c(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex)); + NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type)); + if (type > highestType) highestType = type; } } struct ncclConnect* sendData = recvData+recvChannels; for (int c=0; c(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex)); + NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type)); + if (type > highestType) highestType = type; } } @@ -305,7 +302,11 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex; //NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn)); conn->connected = 1; +#if CUDART_VERSION >= 11030 //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); +#else + //CUDACHECK(hipMemcpy(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); +#endif } } for (int c=0; cchannels[c].peers[recvPeer].recv + connIndex; //NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn)); conn->connected = 1; +#if CUDART_VERSION >= 11030 //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); +#else + //CUDACHECK(hipMemcpy(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); +#endif } } - comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0; + comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0; } - //CUDACHECK(hipStreamSynchronize(transportSetupStream)); - //CUDACHECK(hipStreamDestroy(transportSetupStream)); +#if CUDART_VERSION >= 11030 + CUDACHECK(hipStreamSynchronize(transportSetupStream)); + CUDACHECK(hipStreamDestroy(transportSetupStream)); +#endif + if (highestTransportType != NULL) *highestTransportType = highestType; return ncclSuccess; } @@ -413,22 +421,18 @@ cleanup: } ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { - int rank = comm->rank; - int nranks = comm->nRanks; // AllGather collNet setup results - int* allGatherFailures; - NCCLCHECK(ncclCalloc(&allGatherFailures, nranks)); - allGatherFailures[rank] = collNetSetupFail; - //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int))); - for (int i=0; iintraNodeRank] = collNetSetupFail; + //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int))); + for (int i=0; ilocalRanks; i++) { if (allGatherFailures[i] != 0) { collNetSetupFail = 1; break; } } - free(allGatherFailures); if (collNetSetupFail) { - if (rank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); + if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); return ncclSystemError; } return ncclSuccess; @@ -489,13 +493,14 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0; int myCompCap = allGather1Data[rank].cudaCompCap; int minCompCap = myCompCap, maxCompCap = myCompCap; - int intraNodeGlobalRanks[256]; for (int i = 0; i < nranks; i++) { if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) { // Rank is on same node if (intraNodeRanks == 0) intraNodeRank0 = i; if (i == rank) intraNodeRank = intraNodeRanks; - intraNodeGlobalRanks[intraNodeRanks++] = i; + comm->intraNodeGlobalRanks[intraNodeRanks] = i; + comm->rankToIntraNodeRank[i] = intraNodeRanks; + intraNodeRanks++; if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) { // Rank is in same process if (intraProcRanks == 0) intraProcRank0 = i; @@ -524,6 +529,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t } struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm; uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash; + comm->intraNodeRank = intraNodeRank; // AllGather1 - end @@ -846,6 +852,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t // Check if we can setup CollNet if (comm->collNetSupport > 0) { int collNetSetupFail = 0; + int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P}; // Find all head ranks int nHeads = collNetGraph.nChannels; int *heads; @@ -871,16 +878,26 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); // Connect intra-node CollNet + int highestTransportType0, highestTransportType1; for (int c=0; cnChannels; c++) { struct ncclChannel* channelRecv = comm->channels+c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup); for (int c=0; cnChannels; c++) { struct ncclChannel* channelSend = comm->channels+c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup); + + // Exchange highest intra-node transport type among ranks + // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer + comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; + //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int))); + //for (int i=0; ilocalRanks; i++) { + //if (highestTypes[i] > comm->intraHighestTransportType) + //comm->intraHighestTransportType = highestTypes[i]; + //} INFO(NCCL_INIT, "rank %d Connected CollNet", rank); collnet_cleanup: