From fcd6cc45bd2cb3ec844e887a9d72e2ef878c5e19 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 12 Sep 2024 18:24:50 -0500 Subject: [PATCH] Package RCCL headers to support adding RCCL support w/o installed headers (#1075) - in ROCm CI, rocprofiler-sdk gets built before RCCL is installed, this is a workaround for this issue [ROCm/rocprofiler-sdk commit: 8c1382fceb566d9293db6d822ab56eb89d50c2d5] --- .../Templates/rocprofiler-sdk/config.cmake.in | 7 + .../cmake/rocprofiler_config_interfaces.cmake | 14 +- .../include/rocprofiler-sdk/CMakeLists.txt | 1 + .../rocprofiler-sdk/rccl/CMakeLists.txt | 13 + .../include/rocprofiler-sdk/rccl/api_args.h | 18 +- .../rccl/details/CMakeLists.txt | 11 + .../rocprofiler-sdk/rccl/details/api_trace.h | 18 +- .../rocprofiler-sdk/rccl/details/rccl.h | 1081 +++++++++++++++++ .../source/lib/rocprofiler-sdk-tool/tool.cpp | 8 +- .../lib/rocprofiler-sdk/buffer_tracing.cpp | 8 +- .../lib/rocprofiler-sdk/callback_tracing.cpp | 8 +- .../lib/rocprofiler-sdk/rccl/CMakeLists.txt | 2 - .../source/lib/rocprofiler-sdk/rccl/abi.cpp | 88 +- .../rccl/details/CMakeLists.txt | 8 - .../source/lib/rocprofiler-sdk/rccl/rccl.hpp | 16 +- .../rocprofiler-sdk/tests/tools/json-tool.cpp | 36 +- 16 files changed, 1235 insertions(+), 102 deletions(-) create mode 100644 projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/CMakeLists.txt rename projects/rocprofiler-sdk/source/{lib => include}/rocprofiler-sdk/rccl/details/api_trace.h (96%) create mode 100644 projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h delete mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/details/CMakeLists.txt diff --git a/projects/rocprofiler-sdk/cmake/Templates/rocprofiler-sdk/config.cmake.in b/projects/rocprofiler-sdk/cmake/Templates/rocprofiler-sdk/config.cmake.in index 8c0f573263..b443563630 100644 --- 
a/projects/rocprofiler-sdk/cmake/Templates/rocprofiler-sdk/config.cmake.in +++ b/projects/rocprofiler-sdk/cmake/Templates/rocprofiler-sdk/config.cmake.in @@ -99,6 +99,13 @@ else() add_library(@PACKAGE_NAME@::@PACKAGE_NAME@-external-nolink INTERFACE IMPORTED) + # if rccl not found or not found when rocprofiler-sdk + # was built, use the packaged rccl.h and api_trace.h + if(NOT @rccl_FOUND@ OR NOT @rccl_API_TRACE_FOUND@) + target_compile_definitions(@PACKAGE_NAME@::@PACKAGE_NAME@-external-nolink + INTERFACE ROCPROFILER_SDK_USE_SYSTEM_RCCL=0) + endif() + include("${@PACKAGE_NAME@_CMAKE_DIR}/@PACKAGE_NAME@-targets.cmake") @PROJECT_NAME@_config_nolink_target(@PACKAGE_NAME@::@PACKAGE_NAME@-external-nolink hip::host) diff --git a/projects/rocprofiler-sdk/cmake/rocprofiler_config_interfaces.cmake b/projects/rocprofiler-sdk/cmake/rocprofiler_config_interfaces.cmake index d0860f1478..2b9fefd87c 100644 --- a/projects/rocprofiler-sdk/cmake/rocprofiler_config_interfaces.cmake +++ b/projects/rocprofiler-sdk/cmake/rocprofiler_config_interfaces.cmake @@ -301,9 +301,9 @@ target_link_libraries(rocprofiler-otf2 INTERFACE otf2::otf2) # RCCL # # ----------------------------------------------------------------------------------------# + find_package( rccl - REQUIRED CONFIG HINTS ${rocm_version_DIR} @@ -314,4 +314,14 @@ find_package( PATH_SUFFIXES lib/cmake/rccl) -rocprofiler_config_nolink_target(rocprofiler-rccl-nolink rccl::rccl) +if(rccl_FOUND + AND rccl_INCLUDE_DIR + AND EXISTS "${rccl_INCLUDE_DIR}/rccl/amd_detail/api_trace.h") + set(rccl_API_TRACE_FOUND ON) + rocprofiler_config_nolink_target(rocprofiler-rccl-nolink rccl::rccl) +else() + set(rccl_API_TRACE_FOUND OFF) + target_compile_definitions(rocprofiler-rccl-nolink + INTERFACE ROCPROFILER_SDK_USE_SYSTEM_RCCL=0) + +endif() diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt index 0b6915aec7..0289b0f0bb 100644 --- 
a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/CMakeLists.txt @@ -40,5 +40,6 @@ install( add_subdirectory(hip) add_subdirectory(hsa) add_subdirectory(marker) +add_subdirectory(rccl) add_subdirectory(cxx) add_subdirectory(amd_detail) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/CMakeLists.txt new file mode 100644 index 0000000000..5ac407e7ea --- /dev/null +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# +# Installation of public RCCL headers +# +# +set(ROCPROFILER_RCCL_HEADER_FILES api_args.h api_id.h table_id.h) + +install( + FILES ${ROCPROFILER_RCCL_HEADER_FILES} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/rccl + COMPONENT development) + +add_subdirectory(details) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h index 651303dfdf..73870fdea4 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/api_args.h @@ -25,7 +25,23 @@ #include #include -#include +#if !defined(ROCPROFILER_SDK_USE_SYSTEM_RCCL) +# if defined __has_include +# if __has_include() +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 1 +# else +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 0 +# endif +# else +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 0 +# endif +#endif + +#if ROCPROFILER_SDK_USE_SYSTEM_RCCL > 0 +# include +#else +# include +#endif #include diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/CMakeLists.txt new file mode 100644 index 0000000000..68deec2751 --- /dev/null +++ 
b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/CMakeLists.txt @@ -0,0 +1,11 @@ +# +# +# Installation of public RCCL headers +# +# +set(ROCPROFILER_RCCL_DETAILS_HEADER_FILES api_trace.h rccl.h) + +install( + FILES ${ROCPROFILER_RCCL_DETAILS_HEADER_FILES} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/rccl/details + COMPONENT development) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/details/api_trace.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/api_trace.h similarity index 96% rename from projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/details/api_trace.h rename to projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/api_trace.h index a594a29227..1fa2aeda44 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/details/api_trace.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/api_trace.h @@ -22,7 +22,23 @@ #pragma once -#include +#if !defined(ROCPROFILER_SDK_USE_SYSTEM_RCCL) +# if defined __has_include +# if __has_include() +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 1 +# else +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 0 +# endif +# else +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 0 +# endif +#endif + +#if ROCPROFILER_SDK_USE_SYSTEM_RCCL > 0 +# include +#else +# include +#endif #include #include diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h new file mode 100644 index 0000000000..6866824e2c --- /dev/null +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/rccl/details/rccl.h @@ -0,0 +1,1081 @@ +/************************************************************************* + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) Microsoft Corporation. 
Licensed under the MIT License. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_H_ +#define NCCL_H_ + +#include +#include + +#define NCCL_MAJOR 2 +#define NCCL_MINOR 20 +#define NCCL_PATCH 5 +#define NCCL_SUFFIX "" + +#define NCCL_VERSION_CODE 22005 +#define NCCL_VERSION(X, Y, Z) \ + (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z)) + +#define RCCL_BFLOAT16 1 +#define RCCL_FLOAT8 1 +#define RCCL_GATHER_SCATTER 1 +#define RCCL_ALLTOALLV 1 + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/*! @brief Opaque handle to communicator + @details A communicator contains information required to facilitate collective communications + calls */ +typedef struct ncclComm* ncclComm_t; +#define NCCL_COMM_NULL NULL + +#define NCCL_UNIQUE_ID_BYTES 128 +/*! @brief Opaque unique id used to initialize communicators + @details The ncclUniqueId must be passed to all participating ranks */ +typedef struct +{ + char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/ +} ncclUniqueId; + +/*! @defgroup rccl_result_code Result Codes + @details The various result codes that RCCL API calls may return + @{ */ + +/*! @brief Result type + @details Return codes aside from ncclSuccess indicate that a call has failed */ +typedef enum +{ + ncclSuccess = 0, /*!< No error */ + ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */ + ncclSystemError = 2, /*!< Unhandled system error */ + ncclInternalError = 3, /*!< Internal Error - Please report to RCCL developers */ + ncclInvalidArgument = 4, /*!< Invalid argument */ + ncclInvalidUsage = 5, /*!< Invalid usage */ + ncclRemoteError = 6, /*!< Remote process exited or there was a network error */ + ncclInProgress = 7, /*!< RCCL operation in progress */ + ncclNumResults = 8 /*!< Number of result types */ +} ncclResult_t; +/*! 
@} */ + +#define NCCL_CONFIG_UNDEF_INT INT_MIN +#define NCCL_CONFIG_UNDEF_PTR NULL +#define NCCL_SPLIT_NOCOLOR -1 + +/*! @defgroup rccl_config_type Communicator Configuration + @details Structure that allows for customizing Communicator behavior via + ncclCommInitRankConfig + @{ */ + +/*! @brief Communicator configuration + @details Users can assign value to attributes to specify the behavior of a communicator */ +typedef struct ncclConfig_v21700 +{ + /* attributes that users should never touch. */ + size_t size; /*!< Should not be touched */ + unsigned int magic; /*!< Should not be touched */ + unsigned int version; /*!< Should not be touched */ + /* attributes that users are able to customize. */ + int blocking; /*!< Whether or not calls should block */ + int cgaClusterSize; /*!< Cooperative group array cluster size */ + int minCTAs; /*!< Minimum number of cooperative thread arrays (blocks) */ + int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */ + const char* netName; /*!< Force NCCL to use a specific network */ + int splitShare; /*!< Allow communicators to share resources */ +} ncclConfig_t; + +/* Config initializer must be assigned to initialize config structure when it is created. + * Not initialized config will result in an error. */ +#define NCCL_CONFIG_INITIALIZER \ + { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + } +/*! @} */ + +/* NCCL malloc and free function for all types of NCCL optimizations + * (e.g. user buffer registration). The actual allocated size might + * be larger than requested due to granularity requirement. 
*/ +ncclResult_t +ncclMemAlloc(void** ptr, size_t size); +ncclResult_t +pncclMemAlloc(void** ptr, size_t size); + +ncclResult_t +ncclMemFree(void* ptr); +ncclResult_t +pncclMemFree(void* ptr); + +/*! @defgroup rccl_api_version Version Information + @details API call that returns RCCL version + @{ */ + +/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer. + @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] version Pointer to where version will be stored */ + +ncclResult_t +ncclGetVersion(int* version); +/*! @cond include_hidden */ +ncclResult_t +pncclGetVersion(int* version); +/*! @endcond */ +/*! @} */ + +/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction + @details API calls that operate on communicators. + Communicators objects are used to launch collective communication + operations. Unique ranks between 0 and N-1 must be assigned to + each HIP device participating in the same Communicator. + Using the same HIP device for multiple ranks of the same Communicator + is not supported at this time. + @{ */ + +/*! @brief Generates an ID for ncclCommInitRank. + @details Generates an ID to be used in ncclCommInitRank. + ncclGetUniqueId should be called once by a single rank and the + ID should be distributed to all ranks in the communicator before + using it as a parameter for ncclCommInitRank. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] uniqueId Pointer to where uniqueId will be stored */ +ncclResult_t +ncclGetUniqueId(ncclUniqueId* uniqueId); +/*! @cond include_hidden */ +ncclResult_t +pncclGetUniqueId(ncclUniqueId* uniqueId); +/*! @endcond */ + +/*! @brief Create a new communicator with config. + @details Create a new communicator (multi thread/process version) with a configuration + set by users. See @ref rccl_config_type for more details. 
+ Each rank is associated to a CUDA device, which has to be set before calling + ncclCommInitRank. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] comm Pointer to created communicator + @param[in] nranks Total number of ranks participating in this communicator + @param[in] commId UniqueId required for initialization + @param[in] rank Current rank to create communicator for. [0 to nranks-1] + @param[in] config Pointer to communicator configuration */ +ncclResult_t +ncclCommInitRankConfig(ncclComm_t* comm, + int nranks, + ncclUniqueId commId, + int rank, + ncclConfig_t* config); +/*! @cond include_hidden */ +ncclResult_t +pncclCommInitRankConfig(ncclComm_t* comm, + int nranks, + ncclUniqueId commId, + int rank, + ncclConfig_t* config); +/*! @endcond */ + +/*! @brief Creates a new communicator (multi thread/process version). + @details Rank must be between 0 and nranks-1 and unique within a communicator clique. + Each rank is associated to a CUDA device, which has to be set before calling + ncclCommInitRank. ncclCommInitRank implicitly synchronizes with other ranks, + so it must be called by different threads/processes or use + ncclGroupStart/ncclGroupEnd. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] comm Pointer to created communicator + @param[in] nranks Total number of ranks participating in this communicator + @param[in] commId UniqueId required for initialization + @param[in] rank Current rank to create communicator for */ +ncclResult_t +ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +/*! @cond include_hidden */ +ncclResult_t +pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +/*! @endcond */ + +/*! @brief Creates a clique of communicators (single process version). + @details This is a convenience function to create a single-process communicator clique. + Returns an array of ndev newly initialized communicators in comm. 
+ comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). + If devlist is NULL, the first ndev HIP devices are used. + Order of devlist defines user-order of processors within the communicator. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] comm Pointer to array of created communicators + @param[in] ndev Total number of ranks participating in this communicator + @param[in] devlist Array of GPU device indices to create for */ +ncclResult_t +ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +/*! @cond include_hidden */ +ncclResult_t +pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +/*! @endcond */ + +/*! @brief Finalize a communicator. + @details ncclCommFinalize flushes all issued communications + and marks communicator state as ncclInProgress. The state will change to ncclSuccess + when the communicator is globally quiescent and related resources are freed; then, + calling ncclCommDestroy can locally free the rest of the resources (e.g. + communicator itself) without blocking. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to finalize */ +ncclResult_t +ncclCommFinalize(ncclComm_t comm); +/*! @cond include_hidden */ +ncclResult_t +pncclCommFinalize(ncclComm_t comm); +/*! @endcond */ + +/*! @brief Frees local resources associated with communicator object. + @details Destroy all local resources associated with the passed in communicator object + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to destroy */ +ncclResult_t +ncclCommDestroy(ncclComm_t comm); +/*! @cond include_hidden */ +ncclResult_t +pncclCommDestroy(ncclComm_t comm); +/*! @endcond */ + +/*! @brief Abort any in-progress calls and destroy the communicator object. + @details Frees resources associated with communicator object and aborts any operations + that might still be running on the device. + @return Result code. 
See @ref rccl_result_code for more details. + + @param[in] comm Communicator to abort and destroy */ +ncclResult_t +ncclCommAbort(ncclComm_t comm); +/*! @cond include_hidden */ +ncclResult_t +pncclCommAbort(ncclComm_t comm); +/*! @endcond */ + +/*! @brief Create one or more communicators from an existing one. + @details Creates one or more communicators from an existing one. + Ranks with the same color will end up in the same communicator. + Within the new communicator, key will be used to order ranks. + NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group + and will therefore return a NULL communicator. + If config is NULL, the new communicator will inherit the original communicator's + configuration + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Original communicator object for this rank + @param[in] color Color to assign this rank + @param[in] key Key used to order ranks within the same new communicator + @param[out] newcomm Pointer to new communicator + @param[in] config Config file for new communicator. May be NULL to inherit from comm */ +ncclResult_t +ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config); +/*! @cond include_hidden */ +ncclResult_t +pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config); +/*! @endcond */ +/*! @} */ + +/*! @defgroup rccl_api_errcheck Error Checking Calls + @details API calls that check for errors + @{ */ + +/*! @brief Returns a string for each result code. + @details Returns a human-readable string describing the given result code. + @return String containing description of result code. + + @param[in] result Result code to get description for */ +const char* +ncclGetErrorString(ncclResult_t result); +/*! @cond include_hidden */ +const char* +pncclGetErrorString(ncclResult_t result); +/*! @endcond */ + +/* Returns a human-readable message of the last error that occurred. 
*/ +const char* +ncclGetLastError(ncclComm_t comm); +/*! @cond include_hidden */ +const char* +pncclGetLastError(ncclComm_t comm); +/*! @endcond */ + +/*! @brief Checks whether the comm has encountered any asynchronous errors + @details Query whether the provided communicator has encountered any asynchronous errors + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] asyncError Pointer to where result code will be stored */ +ncclResult_t +ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); +/*! @cond include_hidden */ +ncclResult_t +pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); +/*! @endcond */ +/*! @} */ + +/*! @defgroup rccl_api_comminfo Communicator Information + @details API calls that query communicator information + @{ */ + +/*! @brief Gets the number of ranks in the communicator clique. + @details Returns the number of ranks in the communicator clique (as set during + initialization) + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] count Pointer to where number of ranks will be stored */ +ncclResult_t +ncclCommCount(const ncclComm_t comm, int* count); +/*! @cond include_hidden */ +ncclResult_t +pncclCommCount(const ncclComm_t comm, int* count); +/*! @endcond */ + +/*! @brief Get the ROCm device index associated with a communicator + @details Returns the ROCm device number associated with the provided communicator. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] device Pointer to where the associated ROCm device index will be stored */ +ncclResult_t +ncclCommCuDevice(const ncclComm_t comm, int* device); +/*! @cond include_hidden */ +ncclResult_t +pncclCommCuDevice(const ncclComm_t comm, int* device); +/*! @endcond */ + +/*! 
@brief Get the rank associated with a communicator + @details Returns the user-ordered "rank" associated with the provided communicator. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] rank Pointer to where the associated rank will be stored */ +ncclResult_t +ncclCommUserRank(const ncclComm_t comm, int* rank); +/*! @cond include_hidden */ +ncclResult_t +pncclCommUserRank(const ncclComm_t comm, int* rank); +/*! @endcond */ +/*! @} */ + +/* Register CUDA buffer for zero-copy operation */ +ncclResult_t +ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +/*! @cond include_hidden */ +ncclResult_t +pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +/*! @endcond */ + +/* Deregister CUDA buffer */ +ncclResult_t +ncclCommDeregister(const ncclComm_t comm, void* handle); +/*! @cond include_hidden */ +ncclResult_t +pncclCommDeregister(const ncclComm_t comm, void* handle); +/*! @endcond */ + +/*! @defgroup rccl_api_enumerations API Enumerations + @details Enumerations used by collective communication calls + @{ */ + +/*! @brief Dummy reduction enumeration + @details Dummy reduction enumeration used to determine value for ncclMaxRedOp */ +typedef enum +{ + ncclNumOps_dummy = 5 +} ncclRedOp_dummy_t; + +/*! @brief Reduction operation selector + @details Enumeration used to specify the various reduction operations + ncclNumOps is the number of built-in ncclRedOp_t values and serves as + the least possible value for dynamic ncclRedOp_t values constructed by + ncclRedOpCreate functions. + + ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined + to be the largest signed value (since compilers are permitted to use + signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous + RCCL versions to maintain ABI compatibility. 
*/ +typedef enum +{ + ncclSum = 0, /*!< Sum */ + ncclProd = 1, /*!< Product */ + ncclMax = 2, /*!< Max */ + ncclMin = 3, /*!< Min */ + ncclAvg = 4, /*!< Average */ + ncclNumOps = 5, /*!< Number of built-in reduction ops */ + ncclMaxRedOp = + 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */ +} ncclRedOp_t; + +/*! @brief Data types + @details Enumeration of the various supported datatype */ +typedef enum +{ + ncclInt8 = 0, + ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, + ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, + ncclHalf = 6, + ncclFloat32 = 7, + ncclFloat = 7, + ncclFloat64 = 8, + ncclDouble = 8, + ncclBfloat16 = 9, +#if defined(RCCL_FLOAT8) + ncclFp8E4M3 = 10, + ncclFp8E5M2 = 11, + ncclNumTypes = 12 +} ncclDataType_t; +#else + ncclNumTypes = 10 +} ncclDataType_t; +#endif +/*! @} */ + +/*! @defgroup rccl_api_custom_redop Custom Reduction Operator + @details API calls relating to creation/destroying custom reduction operator + that pre-multiplies local source arrays prior to reduction + @{ */ + +/*! @brief Location and dereferencing logic for scalar arguments. + @details Enumeration specifying memory location of the scalar argument. + Based on where the value is stored, the argument will be dereferenced either + while the collective is running (if in device memory), or before the + ncclRedOpCreate() function returns (if in host memory). */ +typedef enum +{ + ncclScalarDevice = 0, /*!< Scalar is in device-visible memory */ + ncclScalarHostImmediate = 1 /*!< Scalar is in host-visible memory */ +} ncclScalarResidence_t; + +/*! @brief Create a custom pre-multiplier reduction operator + @details Creates a new reduction operator which pre-multiplies input values by a given + scalar locally before reducing them with peer values via summation. For use + only with collectives launched against *comm* and *datatype*. 
The + *residence* argument indicates how/when the memory pointed to by *scalar* + will be dereferenced. Upon return, the newly created operator's handle + is stored in *op*. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] op Pointer to where newly created custom reduction operator is to be + stored + @param[in] scalar Pointer to scalar value. + @param[in] datatype Scalar value datatype + @param[in] residence Memory type of the scalar value + @param[in] comm Communicator to associate with this custom reduction operator */ +ncclResult_t +ncclRedOpCreatePreMulSum(ncclRedOp_t* op, + void* scalar, + ncclDataType_t datatype, + ncclScalarResidence_t residence, + ncclComm_t comm); +/*! @cond include_hidden */ +ncclResult_t +pncclRedOpCreatePreMulSum(ncclRedOp_t* op, + void* scalar, + ncclDataType_t datatype, + ncclScalarResidence_t residence, + ncclComm_t comm); +/*! @endcond */ + +/*! @brief Destroy custom reduction operator + @details Destroys the reduction operator *op*. The operator must have been created by + ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be + destroyed as soon as the last RCCL function which is given that operator returns. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] op Custom reduction operator to be destroyed + @param[in] comm Communicator associated with this reduction operator */ +ncclResult_t +ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); +/*! @cond include_hidden */ +ncclResult_t +pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); +/*! @endcond */ +/*! @} */ + +/*! @defgroup rccl_collective_api Collective Communication Operations + @details Collective communication operations must be called separately for each + communicator in a communicator clique. + + They return when operations have been enqueued on the HIP stream. 
+ Since they may perform inter-CPU synchronization, each call has to be done + from a different thread or process, or need to use Group Semantics (see + below). + @{ */ + +/*! @brief Reduce + @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op* + operation. + *recvbuff* may be NULL on all calls except for root device. + *root* is the rank (not the HIP device) where data will reside after the + operation is complete. + In-place operation will happen if sendbuff == recvbuff. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Local device data buffer to be reduced + @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be + null for other ranks. + @param[in] count Number of elements in every send buffer + @param[in] datatype Data buffer element datatype + @param[in] op Reduction operator type + @param[in] root Rank where result data array will be stored + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclReduce(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclRedOp_t op, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclReduce(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclRedOp_t op, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief (Deprecated) Broadcast (in-place) + @details Copies *count* values from *root* to all other devices. + root is the rank (not the CUDA device) where data resides before the + operation is started. + This operation is implicitly in-place. + @return Result code. See @ref rccl_result_code for more details. + + @param[in,out] buff Input array on *root* to be copied to other ranks. Output array for + all ranks. 
+ @param[in] count Number of elements in data buffer + @param[in] datatype Data buffer element datatype + @param[in] root Rank owning buffer to be copied to others + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclBcast(void* buff, + size_t count, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclBcast(void* buff, + size_t count, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief Broadcast + @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices. + *root* is the rank (not the HIP device) where data resides before the operation is + started. *sendbuff* may be NULL on ranks other than *root*. In-place operation will happen if + *sendbuff* == *recvbuff*. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to copy (if *root*). May be NULL for other ranks + @param[out] recvbuff Data array to store received array + @param[in] count Number of elements in data buffer + @param[in] datatype Data buffer element datatype + @param[in] root Rank of broadcast root + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclBroadcast(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclBroadcast(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief All-Reduce + @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and + leaves identical copies of result on each *recvbuff*. + In-place operation will happen if sendbuff == recvbuff. 
+ @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Input data array to reduce + @param[out] recvbuff Data array to store reduced result array + @param[in] count Number of elements in data buffer + @param[in] datatype Data buffer element datatype + @param[in] op Reduction operator + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclAllReduce(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclAllReduce(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief Reduce-Scatter + @details Reduces data in *sendbuff* using *op* operation and leaves reduced result + scattered over the devices so that *recvbuff* on rank i will contain the i-th + block of the result. + Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff* + should have a size of at least nranks*recvcount elements. + In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Input data array to reduce + @param[out] recvbuff Data array to store reduced result subarray + @param[in] recvcount Number of elements each rank receives + @param[in] datatype Data buffer element datatype + @param[in] op Reduction operator + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclReduceScatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + hipStream_t stream); +/*! 
@cond include_hidden */ +ncclResult_t +pncclReduceScatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + ncclDataType_t datatype, + ncclRedOp_t op, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief All-Gather + @details Each device gathers *sendcount* values from other GPUs into *recvbuff*, + receiving data from rank i at offset i*sendcount. + Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + should have a size of at least nranks*sendcount elements. + In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Input data array to send + @param[out] recvbuff Data array to store the gathered result + @param[in] sendcount Number of elements each rank sends + @param[in] datatype Data buffer element datatype + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclAllGather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + ncclDataType_t datatype, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclAllGather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + ncclDataType_t datatype, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief Send + @details Send data from *sendbuff* to rank *peer*. + Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count* + as this rank. + This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + need to progress concurrently to complete, they must be fused within a + ncclGroupStart / ncclGroupEnd section. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] sendbuff Data array to send + @param[in] count Number of elements to send + @param[in] datatype Data buffer element datatype + @param[in] peer Peer rank to send to + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclSend(const void* sendbuff, + size_t count, + ncclDataType_t datatype, + int peer, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclSend(const void* sendbuff, + size_t count, + ncclDataType_t datatype, + int peer, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief Receive + @details Receive data from rank *peer* into *recvbuff*. + Rank *peer* needs to call ncclSend with the same datatype and the same count + as this rank. + This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + need to progress concurrently to complete, they must be fused within a + ncclGroupStart/ ncclGroupEnd section. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] recvbuff Data array to receive + @param[in] count Number of elements to receive + @param[in] datatype Data buffer element datatype + @param[in] peer Peer rank to receive from + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclRecv(void* recvbuff, + size_t count, + ncclDataType_t datatype, + int peer, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclRecv(void* recvbuff, + size_t count, + ncclDataType_t datatype, + int peer, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief Gather + @details Root device gathers *sendcount* values from other GPUs into *recvbuff*, + receiving data from rank i at offset i*sendcount. + Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff* + should have a size of at least nranks*sendcount elements. 
+ In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + *recvbuff* may be NULL on ranks other than *root*. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to send + @param[out] recvbuff Data array to receive into on *root*. + @param[in] sendcount Number of elements to send per rank + @param[in] datatype Data buffer element datatype + @param[in] root Rank that receives data from all other ranks + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclGather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclGather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief Scatter + @details Scattered over the devices so that recvbuff on rank i will contain the i-th + block of the data on root. + Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff* + should have a size of at least nranks*recvcount elements. + In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to send (on *root* rank). May be NULL on other ranks. 
+ @param[out] recvbuff Data array to receive partial subarray into + @param[in] recvcount Number of elements to receive per rank + @param[in] datatype Data buffer element datatype + @param[in] root Rank that scatters data to all other ranks + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclScatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclScatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + ncclDataType_t datatype, + int root, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief All-To-All + @details Device (i) send (j)th block of data to device (j) and be placed as (i)th + block. Each block for sending/receiving has *count* elements, which means + that *recvbuff* and *sendbuff* should have a size of nranks*count elements. + In-place operation is NOT supported. It is the user's responsibility + to ensure that sendbuff and recvbuff are distinct. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to send (contains blocks for each other rank) + @param[out] recvbuff Data array to receive (contains blocks from each other rank) + @param[in] count Number of elements to send between each pair of ranks + @param[in] datatype Data buffer element datatype + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclAllToAll(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclAllToAll(const void* sendbuff, + void* recvbuff, + size_t count, + ncclDataType_t datatype, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! 
@brief All-To-Allv + @details Device (i) sends sendcounts[j] of data from offset sdispls[j] + to device (j). At the same time, device (i) receives recvcounts[j] of data + from device (j) to be placed at rdispls[j]. + sendcounts, sdispls, recvcounts and rdispls are all measured in the units + of datatype, not bytes. + In-place operation will happen if sendbuff == recvbuff. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to send (contains blocks for each other rank) + @param[in] sendcounts Array containing number of elements to send to each participating rank + @param[in] sdispls Array of offsets into *sendbuff* for each participating rank + @param[out] recvbuff Data array to receive (contains blocks from each other rank) + @param[in] recvcounts Array containing number of elements to receive from each participating + rank + @param[in] rdispls Array of offsets into *recvbuff* for each participating rank + @param[in] datatype Data buffer element datatype + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +ncclAllToAllv(const void* sendbuff, + const size_t sendcounts[], + const size_t sdispls[], + void* recvbuff, + const size_t recvcounts[], + const size_t rdispls[], + ncclDataType_t datatype, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pncclAllToAllv(const void* sendbuff, + const size_t sendcounts[], + const size_t sdispls[], + void* recvbuff, + const size_t recvcounts[], + const size_t rdispls[], + ncclDataType_t datatype, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @} */ + +/*! @defgroup msccl_api MSCCL Algorithm + @details API calls relating to the optional MSCCL algorithm datapath + @{ */ + +/*! @brief Opaque handle to MSCCL algorithm */ +typedef int mscclAlgoHandle_t; + +/*! 
@brief MSCCL Load Algorithm + @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return + its handle via mscclAlgoHandle. This API is expected to be called by MSCCL + scheduler instead of end users. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] mscclAlgoFilePath Path to MSCCL algorithm file + @param[out] mscclAlgoHandle Returned handle to MSCCL algorithm + @param[in] rank Current rank */ +ncclResult_t +mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank); +/*! @cond include_hidden */ +ncclResult_t +pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank); +/*! @endcond */ + +/*! @brief MSCCL Run Algorithm + @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter + list merges all possible parameters required by different operations as this + is a general-purpose API. This API is expected to be called by MSCCL + scheduler instead of end users. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] sendBuff Data array to send + @param[in] sendCounts Array containing number of elements to send to each participating + rank + @param[in] sDisPls Array of offsets into *sendBuff* for each participating rank + @param[out] recvBuff Data array to receive + @param[in] recvCounts Array containing number of elements to receive from each + participating rank + @param[in] rDisPls Array of offsets into *recvBuff* for each participating rank + @param[in] count Number of elements + @param[in] dataType Data buffer element datatype + @param[in] root Root rank index + @param[in] peer Peer rank index + @param[in] op Reduction operator + @param[in] mscclAlgoHandle Handle to MSCCL algorithm + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ +ncclResult_t +mscclRunAlgo(const void* sendBuff, + const size_t sendCounts[], + const size_t sDisPls[], + void* recvBuff, + const size_t recvCounts[], + const size_t rDisPls[], + size_t count, + ncclDataType_t dataType, + int root, + int peer, + ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, + ncclComm_t comm, + hipStream_t stream); +/*! @cond include_hidden */ +ncclResult_t +pmscclRunAlgo(const void* sendBuff, + const size_t sendCounts[], + const size_t sDisPls[], + void* recvBuff, + const size_t recvCounts[], + const size_t rDisPls[], + size_t count, + ncclDataType_t dataType, + int root, + int peer, + ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, + ncclComm_t comm, + hipStream_t stream); +/*! @endcond */ + +/*! @brief MSCCL Unload Algorithm + @deprecated This function has been removed from the public API. + @details Unload MSCCL algorithm previously loaded using its handle. This API + is expected to be called by MSCCL scheduler instead of end users. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload +*/ +ncclResult_t +mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); +/*! @cond include_hidden */ +ncclResult_t +pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); +/*! @endcond */ +/*! @} */ + +/*! @defgroup rccl_group_api Group semantics + @details When managing multiple GPUs from a single thread, and since RCCL collective + calls may perform inter-CPU synchronization, we need to "group" calls for + different ranks/devices into a single call. + + Grouping RCCL calls as being part of the same collective operation is done + using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + collective calls until the ncclGroupEnd call, which will wait for all calls + to be complete. Note that for collective communication, ncclGroupEnd only + guarantees that the operations are enqueued on the streams, not that + the operation is effectively done. + + Both collective communication and ncclCommInitRank can be used in conjunction + with ncclGroupStart/ncclGroupEnd, but not together. + + Group semantics also allow fusing multiple operations on the same device + to improve performance (for aggregated collective calls), or to permit + concurrent progress of multiple send/receive operations. + @{ */ + +/*! @brief Group Start + @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into + a single RCCL operation. Nothing will be started on the HIP stream until + ncclGroupEnd. + @return Result code. See @ref rccl_result_code for more details. */ +ncclResult_t +ncclGroupStart(); +/*! @cond include_hidden */ +ncclResult_t +pncclGroupStart(); +/*! @endcond */ + +/*! @brief Group End + @details End a group call. Start a fused RCCL operation consisting of all calls since + ncclGroupStart. Operations on the HIP stream depending on the RCCL operations + need to be called after ncclGroupEnd. + @return Result code. See @ref rccl_result_code for more details. 
*/ +ncclResult_t +ncclGroupEnd(); +/*! @cond include_hidden */ +ncclResult_t +pncclGroupEnd(); +/*! @endcond */ +/*! @} */ + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // end include guard diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index 5a2b617d67..1524da61b2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -1444,15 +1444,13 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &get_buffers().rccl_api_trace), "buffer creation"); - auto _status = + ROCPROFILER_CALL( rocprofiler_configure_buffer_tracing_service(get_client_ctx(), ROCPROFILER_BUFFER_TRACING_RCCL_API, nullptr, 0, - get_buffers().rccl_api_trace); - - if(_status != ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED) - ROCPROFILER_CALL(_status, "buffer tracing service for rccl api configure"); + get_buffers().rccl_api_trace), + "buffer tracing service for rccl api configure"); } if(tool::get_config().counter_collection) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp index de7d17534b..aa51416ecf 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp @@ -101,10 +101,10 @@ get_unsupported() { auto unsupported = std::unordered_set{}; -#if ROCPROFILER_SDK_RCCL_HAS_API_TRACE == 0 - // Built against RCCL which does not support API tracing - unsupported.emplace(ROCPROFILER_BUFFER_TRACING_RCCL_API); -#endif + // #if ROCPROFILER_SDK_USE_SYSTEM_RCCL == 0 + // // Built against RCCL which does not support API tracing + // unsupported.emplace(ROCPROFILER_BUFFER_TRACING_RCCL_API); + // #endif return unsupported; } diff --git 
a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp index 02246191d1..4d5e3b6137 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/callback_tracing.cpp @@ -98,10 +98,10 @@ get_unsupported() { auto unsupported = std::unordered_set{}; -#if ROCPROFILER_SDK_RCCL_HAS_API_TRACE == 0 - // Built against RCCL which does not support API tracing - unsupported.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API); -#endif + // #if ROCPROFILER_SDK_USE_SYSTEM_RCCL == 0 + // // Built against RCCL which does not support API tracing + // unsupported.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API); + // #endif return unsupported; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/CMakeLists.txt index 89b8aedf31..0d5db03bab 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/CMakeLists.txt @@ -3,5 +3,3 @@ set(ROCPROFILER_LIB_RCCL_HEADERS defines.hpp rccl.hpp) target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_RCCL_SOURCES} ${ROCPROFILER_LIB_RCCL_HEADERS}) - -add_subdirectory(details) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp index a834cd9259..7b6a57e340 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/abi.cpp @@ -22,13 +22,11 @@ #include "lib/rocprofiler-sdk/rccl/rccl.hpp" -#if ROCPROFILER_SDK_RCCL_HAS_API_TRACE > 0 +#include "lib/common/abi.hpp" +#include "lib/common/defines.hpp" -# include "lib/common/abi.hpp" -# include "lib/common/defines.hpp" - -# include -# include +#include +#include namespace 
rocprofiler { @@ -37,46 +35,44 @@ namespace rccl static_assert(RCCL_API_TRACE_VERSION_MAJOR == 0, "Major version updated for RCCL dispatch table"); static_assert(RCCL_API_TRACE_VERSION_PATCH == 0, "Patch version updated for RCCL dispatch table"); -ROCP_SDK_ENFORCE_ABI_VERSIONING(rcclApiFuncTable, 37) +ROCP_SDK_ENFORCE_ABI_VERSIONING(::rcclApiFuncTable, 37) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclAllGather_fn, 0) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclAllReduce_fn, 1) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclAllToAll_fn, 2) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclAllToAllv_fn, 3) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclBroadcast_fn, 4) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclGather_fn, 5) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclReduce_fn, 6) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclReduceScatter_fn, 7) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclScatter_fn, 8) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclSend_fn, 9) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclRecv_fn, 10) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclRedOpCreatePreMulSum_fn, 11) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclRedOpDestroy_fn, 12) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclGroupStart_fn, 13) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclGroupEnd_fn, 14) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclGetVersion_fn, 15) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclGetUniqueId_fn, 16) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommInitRank_fn, 17) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommInitAll_fn, 18) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommInitRankConfig_fn, 19) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommFinalize_fn, 20) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommDestroy_fn, 21) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommAbort_fn, 22) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommSplit_fn, 23) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclGetErrorString_fn, 24) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclGetLastError_fn, 25) 
-ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommGetAsyncError_fn, 26) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommCount_fn, 27) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommCuDevice_fn, 28) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommUserRank_fn, 29) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclMemAlloc_fn, 30) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclMemFree_fn, 31) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, mscclLoadAlgo_fn, 32) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, mscclRunAlgo_fn, 33) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, mscclUnloadAlgo_fn, 34) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommRegister_fn, 35) -ROCP_SDK_ENFORCE_ABI(rcclApiFuncTable, ncclCommDeregister_fn, 36) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclAllGather_fn, 0) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclAllReduce_fn, 1) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclAllToAll_fn, 2) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclAllToAllv_fn, 3) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclBroadcast_fn, 4) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclGather_fn, 5) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclReduce_fn, 6) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclReduceScatter_fn, 7) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclScatter_fn, 8) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclSend_fn, 9) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclRecv_fn, 10) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclRedOpCreatePreMulSum_fn, 11) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclRedOpDestroy_fn, 12) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclGroupStart_fn, 13) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclGroupEnd_fn, 14) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclGetVersion_fn, 15) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclGetUniqueId_fn, 16) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommInitRank_fn, 17) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommInitAll_fn, 18) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommInitRankConfig_fn, 19) 
+ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommFinalize_fn, 20) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommDestroy_fn, 21) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommAbort_fn, 22) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommSplit_fn, 23) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclGetErrorString_fn, 24) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclGetLastError_fn, 25) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommGetAsyncError_fn, 26) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommCount_fn, 27) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommCuDevice_fn, 28) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommUserRank_fn, 29) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclMemAlloc_fn, 30) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclMemFree_fn, 31) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, mscclLoadAlgo_fn, 32) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, mscclRunAlgo_fn, 33) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, mscclUnloadAlgo_fn, 34) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommRegister_fn, 35) +ROCP_SDK_ENFORCE_ABI(::rcclApiFuncTable, ncclCommDeregister_fn, 36) } // namespace rccl } // namespace rocprofiler - -#endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/details/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/details/CMakeLists.txt deleted file mode 100644 index d2f12fefc4..0000000000 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/details/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -# -# -# -set(ROCPROFILER_LIB_RCCL_DETAILS_SOURCES) -set(ROCPROFILER_LIB_RCCL_DETAILS_HEADERS api_trace.h) - -target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_RCCL_DETAILS_SOURCES} - ${ROCPROFILER_LIB_RCCL_DETAILS_HEADERS}) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.hpp index 3c3eeeb75f..ab952b7707 100644 --- 
a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rccl/rccl.hpp @@ -22,28 +22,28 @@ #pragma once -#if !defined(ROCPROFILER_SDK_RCCL_HAS_API_TRACE) +#if !defined(ROCPROFILER_SDK_USE_SYSTEM_RCCL) # if defined __has_include # if __has_include() -# define ROCPROFILER_SDK_RCCL_HAS_API_TRACE 1 +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 1 # else -# define ROCPROFILER_SDK_RCCL_HAS_API_TRACE 0 +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 0 # endif # else -# define ROCPROFILER_SDK_RCCL_HAS_API_TRACE 0 +# define ROCPROFILER_SDK_USE_SYSTEM_RCCL 0 # endif #endif -#if ROCPROFILER_SDK_RCCL_HAS_API_TRACE > 0 +#if ROCPROFILER_SDK_USE_SYSTEM_RCCL > 0 # include +# include #else -# include "lib/rocprofiler-sdk/rccl/details/api_trace.h" +# include +# include #endif #include -#include - #include #include diff --git a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp index a3d94dfda4..781d2bbcea 100644 --- a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp +++ b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp @@ -1101,17 +1101,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr), "scratch memory tracing service configure"); - { - auto _status = - rocprofiler_configure_callback_tracing_service(rccl_api_callback_ctx, - ROCPROFILER_CALLBACK_TRACING_RCCL_API, - nullptr, - 0, - tool_tracing_callback, - nullptr); - if(_status != ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED) - ROCPROFILER_CALL(_status, "rccl api callback tracing service configure"); - } + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(rccl_api_callback_ctx, + ROCPROFILER_CALLBACK_TRACING_RCCL_API, + nullptr, + 0, + tool_tracing_callback, + nullptr), + "rccl api callback tracing service configure"); constexpr auto buffer_size = 8192; constexpr auto watermark = 7936; @@ -1297,16 +1294,13 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* 
tool_data) corr_id_retire_buffer), "buffer tracing service for memory copy configure"); - { - auto _status = - rocprofiler_configure_buffer_tracing_service(rccl_api_buffered_ctx, - ROCPROFILER_BUFFER_TRACING_RCCL_API, - nullptr, - 0, - rccl_api_buffered_buffer); - if(_status != ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED) - ROCPROFILER_CALL(_status, "buffer tracing service configure"); - } + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service(rccl_api_buffered_ctx, + ROCPROFILER_BUFFER_TRACING_RCCL_API, + nullptr, + 0, + rccl_api_buffered_buffer), + "buffer tracing service for rccl api configure"); ROCPROFILER_CALL( rocprofiler_configure_buffered_dispatch_profile_counting_service(