From db840f024e1c771d3a5aea957eeec93c9b9887d0 Mon Sep 17 00:00:00 2001 From: mberenjk <146776561+mberenjk@users.noreply.github.com> Date: Thu, 22 Aug 2024 12:36:07 -0500 Subject: [PATCH] adding all nccl apis to api_support to enable rccl tracing by rocprofv3 (#1297) * adding all nccl apis to api_support to enable rccl tracing by rocprofv3 Co-authored-by: Marzieh Berenjkoub Co-authored-by: Jonathan R. Madsen --- CMakeLists.txt | 20 ++ src/collectives.cc | 40 ++- src/enqueue.cc | 5 +- src/group.cc | 5 +- src/include/api_trace.h | 200 ++++++++++++ src/init.cc | 34 +- src/misc/api_trace.c | 9 + src/misc/api_trace.cc | 656 +++++++++++++++++++++++++++++++++++++++ src/msccl.cc | 7 +- src/register.cc | 5 +- test/common/PtrUnion.cpp | 2 +- 11 files changed, 945 insertions(+), 38 deletions(-) create mode 100644 src/include/api_trace.h create mode 100644 src/misc/api_trace.c create mode 100644 src/misc/api_trace.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index b3d770994e..ccbbc15a1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -386,6 +386,7 @@ set(SRC_FILES src/include/align.h src/include/alloc.h src/include/archinfo.h + src/include/api_trace.h src/include/argcheck.h src/include/BfdBacktrace.hpp src/include/bootstrap.h @@ -473,6 +474,8 @@ set(SRC_FILES src/misc/alt_rsmi.cc src/misc/archinfo.cc src/misc/argcheck.cc + src/misc/api_trace.c + src/misc/api_trace.cc # src/misc/cudawrap.cc # src/misc/gdrwrap.cc src/misc/ibvsymbols.cc @@ -691,6 +694,21 @@ endif() ## Set RCCL linked library directories target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR}) +if (ROCM_VERSION VERSION_GREATER_EQUAL "60100") + option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON) +else() + if(RCCL_ROCPROFILER_REGISTER) + message(AUTHOR_WARNING "RCCL_ROCPROFILER_REGISTER is not valid option for ROCm < 6.2. 
Current ROCm version: ${ROCM_VERSION}") + endif() + set(RCCL_ROCPROFILER_REGISTER OFF CACHE BOOL "" FORCE) +endif() +if(RCCL_ROCPROFILER_REGISTER) + find_package(rocprofiler-register REQUIRED) + target_compile_definitions(rccl PRIVATE RCCL_ROCPROFILER_REGISTER=1) + target_link_libraries( + rccl PRIVATE rocprofiler-register::rocprofiler-register) +endif() + ## Set RCCL linked libraries if (HAVE_BFD) target_link_libraries(rccl PRIVATE bfd) @@ -766,6 +784,8 @@ rocm_set_soversion(rccl "1.0") rocm_install_targets(TARGETS rccl) rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/nccl_net.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl) +rocm_install(FILES src/include/api_trace.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail) file(COPY tools/msccl-algorithms DESTINATION ${PROJECT_BINARY_DIR}) file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR}) ## Install Algorithm files under share folder diff --git a/src/collectives.cc b/src/collectives.cc index 896feaac51..89d683a45d 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -9,12 +9,14 @@ #include "enqueue.h" #include "graph/topo.h" #include "nccl.h" +#include "api_trace.h" #include "msccl/msccl_lifecycle.h" NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + +ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { // Just pass the size of one message and not the total bytes sent/received. 
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { @@ -37,7 +39,9 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + + +ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { struct NvtxParamsAllReduce { size_t bytes; @@ -66,7 +70,9 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + + +ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { // Just pass the size of one message and not the total bytes sent/received. 
constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = { @@ -107,7 +113,9 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[], void *recvbuff, const size_t recvcounts[], const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[], + + +ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[], void *recvbuff, const size_t recvcounts[], const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { struct NvtxParamsAllToAllv { @@ -153,7 +161,8 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + +ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { struct NvtxParamsBroadcast { size_t bytes; @@ -187,7 +196,8 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + +ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) { struct NvtxParamsGather { size_t bytes; @@ -224,7 +234,8 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t 
sendcount, NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, + +ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { struct NvtxParamsReduce { size_t bytes; @@ -254,7 +265,9 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + + +ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { struct NvtxParamsReduceScatter { size_t bytes; @@ -282,7 +295,9 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root, + + +ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) { struct NvtxParamsScatter { size_t bytes; @@ -328,7 +343,9 @@ constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + + +ncclResult_t 
ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) @@ -351,7 +368,8 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + +ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) diff --git a/src/enqueue.cc b/src/enqueue.cc index f7ac3d642c..28419c959f 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -20,6 +20,7 @@ #include "rccl_vars.h" #include "transport.h" #include "common.h" +#include "api_trace.h" #include #include <cstring> // std::memcpy #include <cinttypes> // PRIx64 @@ -2170,7 +2171,7 @@ fail: } NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); -ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { +ncclResult_t ncclRedOpCreatePreMulSum_impl(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm")); /* join init thread before creating PreMulSum op. 
*/ NCCLCHECK(ncclCommEnsureReady(comm)); @@ -2209,7 +2210,7 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp } NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm); -ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { +ncclResult_t ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm) { if (0 <= int(op) && int(op) < int(ncclNumOps)) { WARN("ncclRedOpDestroy : operator is a NCCL builtin."); return ncclInvalidArgument; diff --git a/src/group.cc b/src/group.cc index 6200aab189..b724a50fab 100644 --- a/src/group.cc +++ b/src/group.cc @@ -11,6 +11,7 @@ #include "enqueue.h" #include "transport.h" #include "channel.h" +#include "api_trace.h" #include #include "msccl/msccl_lifecycle.h" @@ -83,7 +84,7 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { } NCCL_API(ncclResult_t, ncclGroupStart); -ncclResult_t ncclGroupStart() { +ncclResult_t ncclGroupStart_impl() { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); @@ -101,7 +102,7 @@ ncclResult_t ncclGroupStartInternal() { } NCCL_API(ncclResult_t, ncclGroupEnd); -ncclResult_t ncclGroupEnd() { +ncclResult_t ncclGroupEnd_impl() { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit); diff --git a/src/include/api_trace.h b/src/include/api_trace.h new file mode 100644 index 0000000000..7a1b3588ab --- /dev/null +++ b/src/include/api_trace.h @@ -0,0 +1,200 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "nccl.h" + +#include <stddef.h> +#include <stdint.h> + +// should only be increased if fundamental changes to dispatch table(s) +#define RCCL_API_TRACE_VERSION_MAJOR 0 + +// should be increased every time new members are added to existing dispatch tables +#define RCCL_API_TRACE_VERSION_PATCH 0 + +#if !defined(RCCL_EXTERN_C_INIT) +# ifdef __cplusplus +# define RCCL_EXTERN_C_INIT \ + extern "C" \ + { +# else +# define RCCL_EXTERN_C_INIT +# endif +#endif + +#if !defined(RCCL_EXTERN_C_FINI) +# ifdef __cplusplus +# define RCCL_EXTERN_C_FINI } +# else +# define RCCL_EXTERN_C_FINI +# endif +#endif + +RCCL_EXTERN_C_INIT + +typedef uint64_t rccl_range_id_t; +typedef ncclResult_t (*ncclAllGather_fn_t)(const void* sendbuff, void* recvbuff, + size_t sendcount, ncclDataType_t datatype, + ncclComm_t comm, hipStream_t stream); +typedef ncclResult_t (*ncclAllReduce_fn_t)(const void* sendbuff, void* recvbuff, + size_t count, ncclDataType_t datatype, + ncclRedOp_t op, struct ncclComm* comm, + cudaStream_t stream); +typedef ncclResult_t (*ncclAllToAll_fn_t)(const void* sendbuff, void* recvbuff, + size_t count, ncclDataType_t datatype, + ncclComm_t comm, hipStream_t stream); +typedef ncclResult_t (*ncclAllToAllv_fn_t)( + const void* sendbuff, const size_t sendcounts[], const size_t sdispls[], + void* recvbuff, const size_t recvcounts[], const size_t rdispls[], + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +typedef ncclResult_t (*ncclBroadcast_fn_t)(const void* sendbuff, void* recvbuff, + size_t count, ncclDataType_t datatype, + int root, ncclComm_t comm, + cudaStream_t stream); +typedef ncclResult_t (*ncclGather_fn_t)(const void* sendbuff, void* recvbuff, + size_t sendcount, ncclDataType_t datatype, + int root, ncclComm_t comm, hipStream_t stream); +typedef ncclResult_t (*ncclReduce_fn_t)(const void* sendbuff, void* recvbuff, + size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, + cudaStream_t stream); +typedef ncclResult_t 
(*ncclReduceScatter_fn_t)(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, struct ncclComm* comm, + cudaStream_t stream); +typedef ncclResult_t (*ncclScatter_fn_t)(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, + int root, ncclComm_t comm, hipStream_t stream); +typedef ncclResult_t (*ncclSend_fn_t)(const void* sendbuff, size_t count, + ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); +typedef ncclResult_t (*ncclRecv_fn_t)(void* recvbuff, size_t count, + ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); +typedef ncclResult_t (*ncclRedOpCreatePreMulSum_fn_t)(ncclRedOp_t* op, void* scalar, + ncclDataType_t datatype, + ncclScalarResidence_t residence, + ncclComm_t comm); +typedef ncclResult_t (*ncclRedOpDestroy_fn_t)(ncclRedOp_t op, ncclComm_t comm); +typedef ncclResult_t (*ncclGroupStart_fn_t)(); +typedef ncclResult_t (*ncclGroupEnd_fn_t)(); +typedef ncclResult_t (*ncclGetVersion_fn_t)(int* version); +typedef ncclResult_t (*ncclGetUniqueId_fn_t)(ncclUniqueId* out); + +typedef ncclResult_t (*ncclCommInitRank_fn_t)(ncclComm_t* newcomm, int nranks, + ncclUniqueId commId, int myrank); + +typedef ncclResult_t (*ncclCommInitAll_fn_t)(ncclComm_t* comms, int ndev, + const int* devlist); + +typedef ncclResult_t (*ncclCommInitRankConfig_fn_t)(ncclComm_t* comm, int nranks, + ncclUniqueId commId, int myrank, + ncclConfig_t* config); + +typedef ncclResult_t (*ncclCommFinalize_fn_t)(ncclComm_t comm); + +typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm); + +typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm); + +typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key, + ncclComm_t* newcomm, ncclConfig_t* config); + +typedef const char* (*ncclGetErrorString_fn_t)(ncclResult_t code); + +typedef const char* (*ncclGetLastError_fn_t)(const ncclComm_t comm); + +typedef ncclResult_t 
(*ncclCommGetAsyncError_fn_t)(ncclComm_t comm, + ncclResult_t* asyncError); + +typedef ncclResult_t (*ncclCommCount_fn_t)(const ncclComm_t comm, int* count); + +typedef ncclResult_t (*ncclCommCuDevice_fn_t)(const ncclComm_t comm, int* devid); + +typedef ncclResult_t (*ncclCommUserRank_fn_t)(const ncclComm_t comm, int* rank); + +typedef ncclResult_t (*ncclMemAlloc_fn_t)(void** ptr, size_t size); + +typedef ncclResult_t (*ncclMemFree_fn_t)(void* ptr); + +typedef ncclResult_t (*mscclLoadAlgo_fn_t)(const char* mscclAlgoFilePath, + mscclAlgoHandle_t* mscclAlgoHandle, int rank); + +typedef ncclResult_t (*mscclRunAlgo_fn_t)( + const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], + void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], size_t count, + ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream); + +typedef ncclResult_t (*mscclUnloadAlgo_fn_t)(mscclAlgoHandle_t mscclAlgoHandle); + +typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff, + size_t size, void** handle); + +typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle); + +typedef struct rcclApiFuncTable +{ + uint64_t size; + ncclAllGather_fn_t ncclAllGather_fn; + ncclAllReduce_fn_t ncclAllReduce_fn; + ncclAllToAll_fn_t ncclAllToAll_fn; + ncclAllToAllv_fn_t ncclAllToAllv_fn; + ncclBroadcast_fn_t ncclBroadcast_fn; + ncclGather_fn_t ncclGather_fn; + ncclReduce_fn_t ncclReduce_fn; + ncclReduceScatter_fn_t ncclReduceScatter_fn; + ncclScatter_fn_t ncclScatter_fn; + ncclSend_fn_t ncclSend_fn; + ncclRecv_fn_t ncclRecv_fn; + ncclRedOpCreatePreMulSum_fn_t ncclRedOpCreatePreMulSum_fn; + ncclRedOpDestroy_fn_t ncclRedOpDestroy_fn; + ncclGroupStart_fn_t ncclGroupStart_fn; + ncclGroupEnd_fn_t ncclGroupEnd_fn; + ncclGetVersion_fn_t ncclGetVersion_fn; + ncclGetUniqueId_fn_t ncclGetUniqueId_fn; + ncclCommInitRank_fn_t ncclCommInitRank_fn; + ncclCommInitAll_fn_t 
ncclCommInitAll_fn; + ncclCommInitRankConfig_fn_t ncclCommInitRankConfig_fn; + ncclCommFinalize_fn_t ncclCommFinalize_fn; + ncclCommDestroy_fn_t ncclCommDestroy_fn; + ncclCommAbort_fn_t ncclCommAbort_fn; + ncclCommSplit_fn_t ncclCommSplit_fn; + ncclGetErrorString_fn_t ncclGetErrorString_fn; + ncclGetLastError_fn_t ncclGetLastError_fn; + ncclCommGetAsyncError_fn_t ncclCommGetAsyncError_fn; + ncclCommCount_fn_t ncclCommCount_fn; + ncclCommCuDevice_fn_t ncclCommCuDevice_fn; + ncclCommUserRank_fn_t ncclCommUserRank_fn; + ncclMemAlloc_fn_t ncclMemAlloc_fn; + ncclMemFree_fn_t ncclMemFree_fn; + mscclLoadAlgo_fn_t mscclLoadAlgo_fn; + mscclRunAlgo_fn_t mscclRunAlgo_fn; + mscclUnloadAlgo_fn_t mscclUnloadAlgo_fn; + ncclCommRegister_fn_t ncclCommRegister_fn; + ncclCommDeregister_fn_t ncclCommDeregister_fn; + +} rcclApiFuncTable; + +RCCL_EXTERN_C_FINI diff --git a/src/init.cc b/src/init.cc index 2ffb721b8e..c332fd2ba1 100644 --- a/src/init.cc +++ b/src/init.cc @@ -178,14 +178,14 @@ static ncclResult_t ncclInit() { } NCCL_API(ncclResult_t, ncclGetVersion, int* version); -ncclResult_t ncclGetVersion(int* version) { +ncclResult_t ncclGetVersion_impl(int* version) { if (version == NULL) return ncclInvalidArgument; *version = NCCL_VERSION_CODE; return ncclSuccess; } NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); -ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { +ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) { NCCLCHECK(ncclInit()); NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out); @@ -2255,7 +2255,7 @@ constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { }; NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); -ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { +ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { // Load the CUDA 
driver and dlsym hooks (can fail on old drivers) rocmLibraryInit(); @@ -2271,7 +2271,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm } NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); -ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { +ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlist) { ncclResult_t ret = ncclSuccess; int totalnDev; int *gpuFlags = NULL; @@ -2339,7 +2339,7 @@ ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) { } NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config); -ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) { +ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) { NVTX3_FUNC_RANGE_IN(nccl_domain); int cudaDev; ncclResult_t ret = ncclSuccess; @@ -2462,7 +2462,7 @@ fail: } NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); -ncclResult_t ncclCommFinalize(ncclComm_t comm) { +ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; @@ -2576,7 +2576,7 @@ fail: } NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); -ncclResult_t ncclCommDestroy(ncclComm_t comm) { +ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) { if (comm == NULL) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; @@ -2617,7 +2617,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { } NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); -ncclResult_t ncclCommAbort(ncclComm_t comm) { +ncclResult_t ncclCommAbort_impl(ncclComm_t comm) { if (comm == NULL) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; @@ -2649,7 +2649,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { } NCCL_API(ncclResult_t, 
ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); -ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { +ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; @@ -2713,7 +2713,7 @@ fail: } NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); -const char* ncclGetErrorString(ncclResult_t code) { +const char* ncclGetErrorString_impl(ncclResult_t code) { switch (code) { case ncclSuccess : return "no error"; case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; @@ -2731,12 +2731,12 @@ const char* ncclGetErrorString(ncclResult_t code) { * comm is currently unused and can be set to NULL */ NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm); -const char* ncclGetLastError(ncclComm_t comm) { +const char* ncclGetLastError_impl(ncclComm_t comm) { return ncclLastError; } NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); -ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { +ncclResult_t ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); @@ -2746,7 +2746,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { } NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { +ncclResult_t ncclCommCount_impl(const ncclComm_t comm, int* count) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); @@ -2760,7 +2760,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { } 
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { +ncclResult_t ncclCommCuDevice_impl(const ncclComm_t comm, int* devid) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); @@ -2773,7 +2773,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { } NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { +ncclResult_t ncclCommUserRank_impl(const ncclComm_t comm, int* rank) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); @@ -2786,7 +2786,7 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { } NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); -ncclResult_t ncclMemAlloc(void **ptr, size_t size) { +ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; @@ -2863,7 +2863,7 @@ fail: } NCCL_API(ncclResult_t, ncclMemFree, void *ptr); -ncclResult_t ncclMemFree(void *ptr) { +ncclResult_t ncclMemFree_impl(void *ptr) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; int saveDevice; diff --git a/src/misc/api_trace.c b/src/misc/api_trace.c new file mode 100644 index 0000000000..2595a63705 --- /dev/null +++ b/src/misc/api_trace.c @@ -0,0 +1,9 @@ +// +// This file just ensures that api_trace.h is C-compatible +// + +#if defined(__cplusplus) +# error "C source file compiling as C++" +#endif + +#include "api_trace.h" diff --git a/src/misc/api_trace.cc b/src/misc/api_trace.cc new file mode 100644 index 0000000000..92e09eb149 --- /dev/null +++ b/src/misc/api_trace.cc @@ -0,0 +1,656 @@ + +#include "api_trace.h" +#include "core.h" +#include "nccl.h" + +#include +#include +#include +#include + +#if defined(RCCL_ROCPROFILER_REGISTER) && RCCL_ROCPROFILER_REGISTER > 0 +# include + +# define ROCP_REG_VERSION \ + 
ROCPROFILER_REGISTER_COMPUTE_VERSION_3(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH) + +ROCPROFILER_REGISTER_DEFINE_IMPORT(rccl, ROCP_REG_VERSION) +#endif + +ncclResult_t +ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); + +ncclResult_t +ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, + cudaStream_t stream); + +ncclResult_t +ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); + +ncclResult_t +ncclAllToAllv_impl(const void* sendbuff, const size_t sendcounts[], + const size_t sdispls[], void* recvbuff, const size_t recvcounts[], + const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + +ncclResult_t +ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream); + +ncclResult_t +ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); + +ncclResult_t +ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, + cudaStream_t stream); + +ncclResult_t +ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, + cudaStream_t stream); + +ncclResult_t +ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); + +ncclResult_t +ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); + +ncclResult_t +ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t 
stream); + +ncclResult_t +ncclRedOpCreatePreMulSum_impl(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, + ncclScalarResidence_t residence, ncclComm_t comm); + +ncclResult_t +ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm); + +ncclResult_t +ncclGroupStart_impl(); + +ncclResult_t +ncclGroupEnd_impl(); + +ncclResult_t +ncclGetVersion_impl(int* version); + +ncclResult_t +ncclGetUniqueId_impl(ncclUniqueId* out); + +ncclResult_t +ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); + +ncclResult_t +ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlist); + +ncclResult_t +ncclCommInitRankConfig_impl(ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, + ncclConfig_t* config); + +ncclResult_t +ncclCommFinalize_impl(ncclComm_t comm); + +ncclResult_t +ncclCommDestroy_impl(ncclComm_t comm); + +ncclResult_t +ncclCommAbort_impl(ncclComm_t comm); + +ncclResult_t +ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, + ncclConfig_t* config); + +const char* +ncclGetErrorString_impl(ncclResult_t code); + +const char* +ncclGetLastError_impl(const ncclComm_t comm); + +ncclResult_t +ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t* asyncError); + +ncclResult_t +ncclCommCount_impl(const ncclComm_t comm, int* count); + +ncclResult_t +ncclCommCuDevice_impl(const ncclComm_t comm, int* devid); + +ncclResult_t +ncclCommUserRank_impl(const ncclComm_t comm, int* rank); + +ncclResult_t +ncclMemAlloc_impl(void** ptr, size_t size); + +ncclResult_t +ncclMemFree_impl(void* ptr); + +ncclResult_t +mscclLoadAlgo_impl(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, + int rank); + +ncclResult_t +mscclRunAlgo_impl(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], + void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], + size_t count, ncclDataType_t dataType, int root, int peer, + ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t 
comm,
+                  hipStream_t stream);
+
+ncclResult_t
+mscclUnloadAlgo_impl(mscclAlgoHandle_t mscclAlgoHandle);
+
+ncclResult_t
+ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+ncclResult_t
+ncclCommDeregister_impl(const ncclComm_t comm, void* handle);
+
+namespace rccl
+{
+namespace
+{
+
+// Layout checked below: one uint64_t (the size value written first when the
+// table is constructed in RcclGetFunctionTable_impl) followed by one
+// pointer-sized slot per API entry.
+
+// Byte offset of the n-th (0-based) function-pointer slot inside the table.
+constexpr size_t
+compute_table_offset(size_t n)
+{
+    return (sizeof(uint64_t) + (n * sizeof(void*)));
+}
+
+// Total byte size of a table holding `nmembers` slots. This equals the offset
+// one past the last slot, so the single layout formula above is reused instead
+// of being spelled out a second time.
+constexpr size_t
+compute_table_size(size_t nmembers)
+{
+    return compute_table_offset(nmembers);
+}
+
+// Compile-time guard: MEMBER must sit at slot IDX of TABLE. A failure means the
+// members of the table were re-ordered or something was inserted between them.
+#define RCCL_ASSERT_OFFSET(TABLE, MEMBER, IDX)                                           \
+    static_assert(offsetof(TABLE, MEMBER) == compute_table_offset(IDX),                  \
+                  "Do not re-arrange the table members")
+
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllGather_fn, 0);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllReduce_fn, 1);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllToAll_fn, 2);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllToAllv_fn, 3);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclBroadcast_fn, 4);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGather_fn, 5);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclReduce_fn, 6);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclReduceScatter_fn, 7);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclScatter_fn, 8);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclSend_fn, 9);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRecv_fn, 10);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRedOpCreatePreMulSum_fn, 11);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRedOpDestroy_fn, 12);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGroupStart_fn, 13);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGroupEnd_fn, 14);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetVersion_fn, 15);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetUniqueId_fn, 16);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitRank_fn, 17);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitAll_fn, 18);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitRankConfig_fn, 19);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommFinalize_fn, 20);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDestroy_fn, 21);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommAbort_fn, 22);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommSplit_fn, 23);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetErrorString_fn, 24);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetLastError_fn, 25);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommGetAsyncError_fn, 26);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCount_fn, 27);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCuDevice_fn, 28);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommUserRank_fn, 29);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemAlloc_fn, 30);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemFree_fn, 31);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclLoadAlgo_fn, 32);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclRunAlgo_fn, 33);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclUnloadAlgo_fn, 34);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommRegister_fn, 35);
+RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDeregister_fn, 36);
+
+#undef RCCL_ASSERT_OFFSET
+
+// 37 == number of RCCL_ASSERT_OFFSET lines above (slots 0..36).
+static_assert(sizeof(rcclApiFuncTable) == compute_table_size(37),
+              "Update table major/step version and add a new offset assertion if this "
+              "fails to compile");
+
+// Static storage in which the table is placement-constructed, so no heap
+// allocation is involved. The explicit alignas is required: a plain byte array
+// only guarantees alignment 1, which is not sufficient for an object that
+// contains a uint64_t and function pointers.
+alignas(rcclApiFuncTable) std::array<unsigned char, sizeof(rcclApiFuncTable)> m_buffer = {};
+
+// Builds the dispatch table once and returns a pointer to it.
+//
+// The table is constructed inside m_buffer with its size as the first field,
+// followed by the address of every `*_impl` entry point in slot order (the
+// order is pinned by the static_asserts above). Because `tbl` is a
+// function-local static, the construction runs only on the first call.
+//
+// When RCCL_ROCPROFILER_REGISTER is defined to a value > 0 the table is also
+// handed to rocprofiler-register on every call of this function; the outcome
+// is logged, and a warning is emitted for any status other than
+// ROCP_REG_SUCCESS or ROCP_REG_NO_TOOLS.
+// NOTE(review): presumably the registered tool may overwrite slots in the
+// table to intercept calls - confirm against the rocprofiler-register docs.
+rcclApiFuncTable*
+RcclGetFunctionTable_impl()
+{
+    static auto* tbl =
+        new(m_buffer.data()) rcclApiFuncTable{ sizeof(rcclApiFuncTable),
+                                               &ncclAllGather_impl,
+                                               &ncclAllReduce_impl,
+                                               &ncclAllToAll_impl,
+                                               &ncclAllToAllv_impl,
+                                               &ncclBroadcast_impl,
+                                               &ncclGather_impl,
+                                               &ncclReduce_impl,
+                                               &ncclReduceScatter_impl,
+                                               &ncclScatter_impl,
+                                               &ncclSend_impl,
+                                               &ncclRecv_impl,
+                                               &ncclRedOpCreatePreMulSum_impl,
+                                               &ncclRedOpDestroy_impl,
+                                               &ncclGroupStart_impl,
+                                               &ncclGroupEnd_impl,
+                                               &ncclGetVersion_impl,
+                                               &ncclGetUniqueId_impl,
+                                               &ncclCommInitRank_impl,
+                                               &ncclCommInitAll_impl,
+                                               &ncclCommInitRankConfig_impl,
+                                               &ncclCommFinalize_impl,
+                                               &ncclCommDestroy_impl,
+                                               &ncclCommAbort_impl,
+                                               &ncclCommSplit_impl,
+                                               &ncclGetErrorString_impl,
+                                               &ncclGetLastError_impl,
+                                               &ncclCommGetAsyncError_impl,
+                                               &ncclCommCount_impl,
+                                               &ncclCommCuDevice_impl,
+                                               &ncclCommUserRank_impl,
+                                               &ncclMemAlloc_impl,
+                                               &ncclMemFree_impl,
+                                               &mscclLoadAlgo_impl,
+                                               &mscclRunAlgo_impl,
+                                               &mscclUnloadAlgo_impl,
+                                               &ncclCommRegister_impl,
+                                               &ncclCommDeregister_impl };
+
+#if defined(RCCL_ROCPROFILER_REGISTER) && RCCL_ROCPROFILER_REGISTER > 0
+    // The registration call receives an array of type-erased table pointers
+    // plus its length; RCCL exposes exactly one table.
+    std::array<void*, 1> table_array{ tbl };
+    rocprofiler_register_library_indentifier_t lib_id =
+        rocprofiler_register_library_indentifier_t{};
+    rocprofiler_register_error_code_t rocp_reg_status =
+        rocprofiler_register_library_api_table(
+            "rccl", &ROCPROFILER_REGISTER_IMPORT_FUNC(rccl), ROCP_REG_VERSION,
+            table_array.data(), table_array.size(), &lib_id);
+
+    INFO(NCCL_COLL,
+         "[rocprofiler-sdk-rccl][ = %d ] rocprofiler-register returned code = %d : %s",
+         getpid(), rocp_reg_status, rocprofiler_register_error_string(rocp_reg_status));
+
+    // "No tools" is a normal outcome and must not be reported as a failure.
+    if(rocp_reg_status != ROCP_REG_SUCCESS && rocp_reg_status != ROCP_REG_NO_TOOLS)
+        WARN("[rocprofiler-sdk-rccl][%d] rocprofiler-register failed with error code %d "
+             ": %s",
+             getpid(), rocp_reg_status,
+             rocprofiler_register_error_string(rocp_reg_status));
+#endif
+
+    return tbl;
+}
+}  // end of namespace
+
+// Public accessor used by every exported API wrapper in this file. The pointer
+// obtained from RcclGetFunctionTable_impl() is cached in a function-local
+// static, so the impl function (and therefore the optional rocprofiler
+// registration) runs once, on the first API call.
+const rcclApiFuncTable*
+RcclGetFunctionTable()
+{
+    static const auto* tbl = RcclGetFunctionTable_impl();
+    return tbl;
+}
+}  // end of namespace rccl
+
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff,
+         size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+         ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
+
+NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count,
+         ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
+
+NCCL_API(ncclResult_t, ncclAllToAllv, const void* sendbuff, const size_t sendcounts[],
+         const size_t sdispls[], void*
recvbuff, const size_t recvcounts[], + const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + +NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); + +NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); + +NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, + hipStream_t stream); + +NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, + hipStream_t stream); + +NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + +NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, + ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream); + +NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, hipStream_t stream); + +NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t* op, void* scalar, + ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); + +NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm); + +NCCL_API(ncclResult_t, ncclGroupStart); + +NCCL_API(ncclResult_t, ncclGroupEnd); + +NCCL_API(ncclResult_t, ncclGetVersion, int* version); + +NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); + +NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, + ncclUniqueId commId, int myrank); + +NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); + +NCCL_API(ncclResult_t, ncclCommInitRankConfig, 
ncclComm_t* comm, int nranks, + ncclUniqueId commId, int myrank, ncclConfig_t* config); + +NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); + +NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); + +NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); + +NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, + ncclComm_t* newcomm, ncclConfig_t* config); + +NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); + +NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm); + +NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t* asyncError); + +NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); + +NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); + +NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); + +NCCL_API(ncclResult_t, ncclMemAlloc, void** ptr, size_t size); + +NCCL_API(ncclResult_t, ncclMemFree, void* ptr); + +NCCL_API(ncclResult_t, mscclLoadAlgo, const char* mscclAlgoFilePath, + mscclAlgoHandle_t* mscclAlgoHandle, int rank); + +NCCL_API(ncclResult_t, mscclRunAlgo, const void* sendBuff, const size_t sendCounts[], + const size_t sDisPls[], void* recvBuff, const size_t recvCounts[], + const size_t rDisPls[], size_t count, ncclDataType_t dataType, int root, + int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, + hipStream_t stream); + +NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle); + +NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, + void** handle); + +NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); + +ncclResult_t +ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclAllGather_fn(sendbuff, recvbuff, sendcount, + datatype, comm, stream); +} + +ncclResult_t 
+ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclAllReduce_fn(sendbuff, recvbuff, count, + datatype, op, comm, stream); +} + +ncclResult_t +ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclComm_t comm, hipStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclAllToAll_fn(sendbuff, recvbuff, count, + datatype, comm, stream); +} + +ncclResult_t +ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], const size_t sdispls[], + void* recvbuff, const size_t recvcounts[], const size_t rdispls[], + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclAllToAllv_fn(sendbuff, sendcounts, sdispls, + recvbuff, recvcounts, rdispls, + datatype, comm, stream); +} + +ncclResult_t +ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + int root, ncclComm_t comm, cudaStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclBroadcast_fn(sendbuff, recvbuff, count, + datatype, root, comm, stream); +} + +ncclResult_t +ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclGather_fn(sendbuff, recvbuff, sendcount, + datatype, root, comm, stream); +} + +ncclResult_t +ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclReduce_fn( + sendbuff, recvbuff, count, datatype, op, root, comm, stream); +} + +ncclResult_t +ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, + cudaStream_t stream) +{ + return 
::rccl::RcclGetFunctionTable()->ncclReduceScatter_fn( + sendbuff, recvbuff, recvcount, datatype, op, comm, stream); +} + +ncclResult_t +ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclScatter_fn(sendbuff, recvbuff, recvcount, + datatype, root, comm, stream); +} + +ncclResult_t +ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclSend_fn(sendbuff, count, datatype, peer, + comm, stream); +} + +ncclResult_t +ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->ncclRecv_fn(recvbuff, count, datatype, peer, + comm, stream); +} + +ncclResult_t +ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, + ncclScalarResidence_t residence, ncclComm_t comm) +{ + return ::rccl::RcclGetFunctionTable()->ncclRedOpCreatePreMulSum_fn( + op, scalar, datatype, residence, comm); +} + +ncclResult_t +ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) +{ + return ::rccl::RcclGetFunctionTable()->ncclRedOpDestroy_fn(op, comm); +} + +ncclResult_t +ncclGroupStart() +{ + return ::rccl::RcclGetFunctionTable()->ncclGroupStart_fn(); +} + +ncclResult_t +ncclGroupEnd() +{ + return ::rccl::RcclGetFunctionTable()->ncclGroupEnd_fn(); +} + +ncclResult_t +ncclGetVersion(int* version) +{ + return ::rccl::RcclGetFunctionTable()->ncclGetVersion_fn(version); +} + +ncclResult_t +ncclGetUniqueId(ncclUniqueId* out) +{ + return ::rccl::RcclGetFunctionTable()->ncclGetUniqueId_fn(out); +} + +ncclResult_t +ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommInitRank_fn(newcomm, nranks, commId, + myrank); +} + +ncclResult_t +ncclCommInitAll(ncclComm_t* 
comms, int ndev, const int* devlist) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommInitAll_fn(comms, ndev, devlist); +} + +ncclResult_t +ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, + ncclConfig_t* config) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommInitRankConfig_fn(comm, nranks, commId, + myrank, config); +} + +ncclResult_t +ncclCommFinalize(ncclComm_t comm) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommFinalize_fn(comm); +} + +ncclResult_t +ncclCommDestroy(ncclComm_t comm) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommDestroy_fn(comm); +} + +ncclResult_t +ncclCommAbort(ncclComm_t comm) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommAbort_fn(comm); +} + +ncclResult_t +ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, + ncclConfig_t* config) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommSplit_fn(comm, color, key, newcomm, + config); +} + +const char* +ncclGetErrorString(ncclResult_t code) +{ + return ::rccl::RcclGetFunctionTable()->ncclGetErrorString_fn(code); +} + +const char* +ncclGetLastError(const ncclComm_t comm) +{ + return ::rccl::RcclGetFunctionTable()->ncclGetLastError_fn(comm); +} + +ncclResult_t +ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommGetAsyncError_fn(comm, asyncError); +} + +ncclResult_t +ncclCommCount(const ncclComm_t comm, int* count) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommCount_fn(comm, count); +} + +ncclResult_t +ncclCommCuDevice(const ncclComm_t comm, int* devid) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommCuDevice_fn(comm, devid); +} + +ncclResult_t +ncclCommUserRank(const ncclComm_t comm, int* rank) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommUserRank_fn(comm, rank); +} + +ncclResult_t +ncclMemAlloc(void** ptr, size_t size) +{ + return ::rccl::RcclGetFunctionTable()->ncclMemAlloc_fn(ptr, size); +} + +ncclResult_t +ncclMemFree(void* ptr) +{ + 
return ::rccl::RcclGetFunctionTable()->ncclMemFree_fn(ptr); +} + +ncclResult_t +mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank) +{ + return ::rccl::RcclGetFunctionTable()->mscclLoadAlgo_fn(mscclAlgoFilePath, + mscclAlgoHandle, rank); +} + +ncclResult_t +mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], + void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], + size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream) +{ + return ::rccl::RcclGetFunctionTable()->mscclRunAlgo_fn( + sendBuff, sendCounts, sDisPls, recvBuff, recvCounts, rDisPls, count, dataType, + root, peer, op, mscclAlgoHandle, comm, stream); +} + +ncclResult_t +mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle) +{ + return ::rccl::RcclGetFunctionTable()->mscclUnloadAlgo_fn(mscclAlgoHandle); +} + +ncclResult_t +ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommRegister_fn(comm, buff, size, handle); +} + +ncclResult_t +ncclCommDeregister(const ncclComm_t comm, void* handle) +{ + return ::rccl::RcclGetFunctionTable()->ncclCommDeregister_fn(comm, handle); +} diff --git a/src/msccl.cc b/src/msccl.cc index e74e9a8b09..e912703d13 100644 --- a/src/msccl.cc +++ b/src/msccl.cc @@ -7,11 +7,12 @@ #include "msccl/msccl_parser.h" #include "msccl/msccl_setup.h" #include "msccl/msccl_status.h" +#include "api_trace.h" #include #include NCCL_API(ncclResult_t, mscclLoadAlgo, const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); -ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) { +ncclResult_t mscclLoadAlgo_impl(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) { mscclStatus& status = mscclGetStatus(rank); if (status.freeAlgoHandles.size() == 0) { @@ -39,7 
+40,7 @@ NCCL_API(ncclResult_t, mscclRunAlgo, void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream); -ncclResult_t mscclRunAlgo( +ncclResult_t mscclRunAlgo_impl( const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, @@ -76,7 +77,7 @@ ncclResult_t mscclRunAlgo( } NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle); -ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle) { +ncclResult_t mscclUnloadAlgo_impl(mscclAlgoHandle_t mscclAlgoHandle) { // deprecated return ncclSuccess; } diff --git a/src/register.cc b/src/register.cc index 0e252a2f20..c29dd55880 100644 --- a/src/register.cc +++ b/src/register.cc @@ -9,6 +9,7 @@ #include "comm.h" #include "net.h" #include "register.h" +#include "api_trace.h" ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) { struct ncclRegCache* cache = &comm->regCache; @@ -151,7 +152,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) { } NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle); -ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { +ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle) { NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm")); if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister")); NCCLCHECK(ncclRegister(comm, buff, size, handle)); @@ -159,7 +160,7 @@ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, vo } NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); -ncclResult_t ncclCommDeregister(const ncclComm_t comm, 
void* handle) { +ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) { NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm")); struct ncclReg* reg = (struct ncclReg*)handle; struct ncclRegCache* cache = &comm->regCache; diff --git a/test/common/PtrUnion.cpp b/test/common/PtrUnion.cpp index cb0887164b..7ed1558f1e 100644 --- a/test/common/PtrUnion.cpp +++ b/test/common/PtrUnion.cpp @@ -5,7 +5,7 @@ ************************************************************************/ #include "PtrUnion.hpp" - +#include "api_trace.h" namespace RcclUnitTesting { size_t DataTypeToBytes(ncclDataType_t const dataType)