adding all nccl apis to api_support to enable rccl tracing by rocprofv3 (#1297)
* adding all nccl apis to api_support to enable rccl tracing by rocprofv3 Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com> Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
This commit is contained in:
@@ -386,6 +386,7 @@ set(SRC_FILES
|
||||
src/include/align.h
|
||||
src/include/alloc.h
|
||||
src/include/archinfo.h
|
||||
src/include/api_trace.h
|
||||
src/include/argcheck.h
|
||||
src/include/BfdBacktrace.hpp
|
||||
src/include/bootstrap.h
|
||||
@@ -473,6 +474,8 @@ set(SRC_FILES
|
||||
src/misc/alt_rsmi.cc
|
||||
src/misc/archinfo.cc
|
||||
src/misc/argcheck.cc
|
||||
src/misc/api_trace.c
|
||||
src/misc/api_trace.cc
|
||||
# src/misc/cudawrap.cc
|
||||
# src/misc/gdrwrap.cc
|
||||
src/misc/ibvsymbols.cc
|
||||
@@ -691,6 +694,21 @@ endif()
|
||||
## Set RCCL linked library directories
|
||||
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
|
||||
|
||||
# rocprofiler-register support is offered only when ROCM_VERSION is at least
# 60100 (i.e. ROCm 6.1.0). On older ROCm the option is forced OFF, and a warning
# is printed if the user had explicitly requested it.
# NOTE(review): the warning text previously said "ROCm < 6.2" while the check
# below uses 60100; the text now matches the check. Confirm that 6.1.0 is the
# intended minimum release.
if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
    option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
else()
    if(RCCL_ROCPROFILER_REGISTER)
        message(AUTHOR_WARNING "RCCL_ROCPROFILER_REGISTER is not valid option for ROCm < 6.1. Current ROCm version: ${ROCM_VERSION}")
    endif()
    # FORCE so that a stale ON value left in the cache cannot re-enable the feature.
    set(RCCL_ROCPROFILER_REGISTER OFF CACHE BOOL "" FORCE)
endif()

# When enabled: locate the package, expose the feature macro to the sources and
# link the imported target.
if(RCCL_ROCPROFILER_REGISTER)
    find_package(rocprofiler-register REQUIRED)
    target_compile_definitions(rccl PRIVATE RCCL_ROCPROFILER_REGISTER=1)
    target_link_libraries(
        rccl PRIVATE rocprofiler-register::rocprofiler-register)
endif()
|
||||
|
||||
## Set RCCL linked libraries
|
||||
if (HAVE_BFD)
|
||||
target_link_libraries(rccl PRIVATE bfd)
|
||||
@@ -766,6 +784,8 @@ rocm_set_soversion(rccl "1.0")
|
||||
rocm_install_targets(TARGETS rccl)
|
||||
rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/nccl_net.h
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
|
||||
rocm_install(FILES src/include/api_trace.h
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail)
|
||||
file(COPY tools/msccl-algorithms DESTINATION ${PROJECT_BINARY_DIR})
|
||||
file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})
|
||||
## Install Algorithm files under share folder
|
||||
|
||||
+29
-11
@@ -9,12 +9,14 @@
|
||||
#include "enqueue.h"
|
||||
#include "graph/topo.h"
|
||||
#include "nccl.h"
|
||||
#include "api_trace.h"
|
||||
|
||||
#include "msccl/msccl_lifecycle.h"
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
|
||||
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
|
||||
// Just pass the size of one message and not the total bytes sent/received.
|
||||
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
|
||||
@@ -37,7 +39,9 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
|
||||
|
||||
ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||
struct NvtxParamsAllReduce {
|
||||
size_t bytes;
|
||||
@@ -66,7 +70,9 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
|
||||
|
||||
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
// Just pass the size of one message and not the total bytes sent/received.
|
||||
constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = {
|
||||
@@ -107,7 +113,9 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
|
||||
NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
|
||||
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
|
||||
|
||||
|
||||
ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
|
||||
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
|
||||
struct NvtxParamsAllToAllv {
|
||||
@@ -153,7 +161,8 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
|
||||
|
||||
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
|
||||
ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
struct NvtxParamsBroadcast {
|
||||
size_t bytes;
|
||||
@@ -187,7 +196,8 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
|
||||
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
struct NvtxParamsGather {
|
||||
size_t bytes;
|
||||
@@ -224,7 +234,8 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
|
||||
ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
struct NvtxParamsReduce {
|
||||
size_t bytes;
|
||||
@@ -254,7 +265,9 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
|
||||
|
||||
ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||
struct NvtxParamsReduceScatter {
|
||||
size_t bytes;
|
||||
@@ -282,7 +295,9 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv
|
||||
|
||||
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
|
||||
|
||||
|
||||
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct NvtxParamsScatter {
|
||||
size_t bytes;
|
||||
@@ -328,7 +343,9 @@ constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
|
||||
|
||||
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
|
||||
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
|
||||
@@ -351,7 +368,8 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
|
||||
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
|
||||
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
|
||||
|
||||
+3
-2
@@ -20,6 +20,7 @@
|
||||
#include "rccl_vars.h"
|
||||
#include "transport.h"
|
||||
#include "common.h"
|
||||
#include "api_trace.h"
|
||||
#include <cassert>
|
||||
#include <cstring> // std::memcpy
|
||||
#include <cinttypes> // PRIx64
|
||||
@@ -2170,7 +2171,7 @@ fail:
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
|
||||
ncclResult_t ncclRedOpCreatePreMulSum_impl(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
|
||||
/* join init thread before creating PreMulSum op. */
|
||||
NCCLCHECK(ncclCommEnsureReady(comm));
|
||||
@@ -2209,7 +2210,7 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm);
|
||||
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
|
||||
ncclResult_t ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm) {
|
||||
if (0 <= int(op) && int(op) < int(ncclNumOps)) {
|
||||
WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
|
||||
return ncclInvalidArgument;
|
||||
|
||||
+3
-2
@@ -11,6 +11,7 @@
|
||||
#include "enqueue.h"
|
||||
#include "transport.h"
|
||||
#include "channel.h"
|
||||
#include "api_trace.h"
|
||||
#include <assert.h>
|
||||
|
||||
#include "msccl/msccl_lifecycle.h"
|
||||
@@ -83,7 +84,7 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
ncclResult_t ncclGroupStart() {
|
||||
ncclResult_t ncclGroupStart_impl() {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
@@ -101,7 +102,7 @@ ncclResult_t ncclGroupStartInternal() {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
ncclResult_t ncclGroupEnd_impl() {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit);
|
||||
|
||||
@@ -0,0 +1,200 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// Major version of the API-trace interface.
// Bump only for fundamental changes to the dispatch table(s).
#define RCCL_API_TRACE_VERSION_MAJOR 0

// Patch version of the API-trace interface.
// Bump every time new members are added to an existing dispatch table.
#define RCCL_API_TRACE_VERSION_PATCH 0

// RCCL_EXTERN_C_INIT: opens an extern "C" block when compiled as C++,
// expands to nothing when compiled as C.
#ifndef RCCL_EXTERN_C_INIT
#    ifdef __cplusplus
#        define RCCL_EXTERN_C_INIT extern "C" {
#    else
#        define RCCL_EXTERN_C_INIT
#    endif
#endif

// RCCL_EXTERN_C_FINI: closes the block opened by RCCL_EXTERN_C_INIT when
// compiled as C++, expands to nothing when compiled as C.
#ifndef RCCL_EXTERN_C_FINI
#    ifdef __cplusplus
#        define RCCL_EXTERN_C_FINI }
#    else
#        define RCCL_EXTERN_C_FINI
#    endif
#endif
|
||||
|
||||
RCCL_EXTERN_C_INIT
|
||||
|
||||
typedef uint64_t rccl_range_id_t;
|
||||
typedef ncclResult_t (*ncclAllGather_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t sendcount, ncclDataType_t datatype,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
typedef ncclResult_t (*ncclAllReduce_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, struct ncclComm* comm,
|
||||
cudaStream_t stream);
|
||||
typedef ncclResult_t (*ncclAllToAll_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t count, ncclDataType_t datatype,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
typedef ncclResult_t (*ncclAllToAllv_fn_t)(
|
||||
const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
|
||||
void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
typedef ncclResult_t (*ncclBroadcast_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t count, ncclDataType_t datatype,
|
||||
int root, ncclComm_t comm,
|
||||
cudaStream_t stream);
|
||||
typedef ncclResult_t (*ncclGather_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t sendcount, ncclDataType_t datatype,
|
||||
int root, ncclComm_t comm, hipStream_t stream);
|
||||
typedef ncclResult_t (*ncclReduce_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm,
|
||||
cudaStream_t stream);
|
||||
typedef ncclResult_t (*ncclReduceScatter_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, struct ncclComm* comm,
|
||||
cudaStream_t stream);
|
||||
typedef ncclResult_t (*ncclScatter_fn_t)(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype,
|
||||
int root, ncclComm_t comm, hipStream_t stream);
|
||||
typedef ncclResult_t (*ncclSend_fn_t)(const void* sendbuff, size_t count,
|
||||
ncclDataType_t datatype, int peer, ncclComm_t comm,
|
||||
cudaStream_t stream);
|
||||
typedef ncclResult_t (*ncclRecv_fn_t)(void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int peer, ncclComm_t comm,
|
||||
cudaStream_t stream);
|
||||
typedef ncclResult_t (*ncclRedOpCreatePreMulSum_fn_t)(ncclRedOp_t* op, void* scalar,
|
||||
ncclDataType_t datatype,
|
||||
ncclScalarResidence_t residence,
|
||||
ncclComm_t comm);
|
||||
typedef ncclResult_t (*ncclRedOpDestroy_fn_t)(ncclRedOp_t op, ncclComm_t comm);
|
||||
// Function-pointer types for the group start/end entry points, which take no
// arguments. The parameter list is written as (void) because this header must
// also compile as C, where an empty list "()" declares a function with
// unspecified parameters rather than one with no parameters. In C++ the two
// spellings are identical.
typedef ncclResult_t (*ncclGroupStart_fn_t)(void);

typedef ncclResult_t (*ncclGroupEnd_fn_t)(void);
|
||||
typedef ncclResult_t (*ncclGetVersion_fn_t)(int* version);
|
||||
typedef ncclResult_t (*ncclGetUniqueId_fn_t)(ncclUniqueId* out);
|
||||
|
||||
typedef ncclResult_t (*ncclCommInitRank_fn_t)(ncclComm_t* newcomm, int nranks,
|
||||
ncclUniqueId commId, int myrank);
|
||||
|
||||
typedef ncclResult_t (*ncclCommInitAll_fn_t)(ncclComm_t* comms, int ndev,
|
||||
const int* devlist);
|
||||
|
||||
typedef ncclResult_t (*ncclCommInitRankConfig_fn_t)(ncclComm_t* comm, int nranks,
|
||||
ncclUniqueId commId, int myrank,
|
||||
ncclConfig_t* config);
|
||||
|
||||
typedef ncclResult_t (*ncclCommFinalize_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key,
|
||||
ncclComm_t* newcomm, ncclConfig_t* config);
|
||||
|
||||
typedef const char* (*ncclGetErrorString_fn_t)(ncclResult_t code);
|
||||
|
||||
typedef const char* (*ncclGetLastError_fn_t)(const ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommGetAsyncError_fn_t)(ncclComm_t comm,
|
||||
ncclResult_t* asyncError);
|
||||
|
||||
typedef ncclResult_t (*ncclCommCount_fn_t)(const ncclComm_t comm, int* count);
|
||||
|
||||
typedef ncclResult_t (*ncclCommCuDevice_fn_t)(const ncclComm_t comm, int* devid);
|
||||
|
||||
typedef ncclResult_t (*ncclCommUserRank_fn_t)(const ncclComm_t comm, int* rank);
|
||||
|
||||
typedef ncclResult_t (*ncclMemAlloc_fn_t)(void** ptr, size_t size);
|
||||
|
||||
typedef ncclResult_t (*ncclMemFree_fn_t)(void* ptr);
|
||||
|
||||
typedef ncclResult_t (*mscclLoadAlgo_fn_t)(const char* mscclAlgoFilePath,
|
||||
mscclAlgoHandle_t* mscclAlgoHandle, int rank);
|
||||
|
||||
typedef ncclResult_t (*mscclRunAlgo_fn_t)(
|
||||
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
|
||||
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], size_t count,
|
||||
ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
|
||||
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
typedef ncclResult_t (*mscclUnloadAlgo_fn_t)(mscclAlgoHandle_t mscclAlgoHandle);
|
||||
|
||||
typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,
|
||||
size_t size, void** handle);
|
||||
|
||||
typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
|
||||
|
||||
typedef struct rcclApiFuncTable
|
||||
{
|
||||
uint64_t size;
|
||||
ncclAllGather_fn_t ncclAllGather_fn;
|
||||
ncclAllReduce_fn_t ncclAllReduce_fn;
|
||||
ncclAllToAll_fn_t ncclAllToAll_fn;
|
||||
ncclAllToAllv_fn_t ncclAllToAllv_fn;
|
||||
ncclBroadcast_fn_t ncclBroadcast_fn;
|
||||
ncclGather_fn_t ncclGather_fn;
|
||||
ncclReduce_fn_t ncclReduce_fn;
|
||||
ncclReduceScatter_fn_t ncclReduceScatter_fn;
|
||||
ncclScatter_fn_t ncclScatter_fn;
|
||||
ncclSend_fn_t ncclSend_fn;
|
||||
ncclRecv_fn_t ncclRecv_fn;
|
||||
ncclRedOpCreatePreMulSum_fn_t ncclRedOpCreatePreMulSum_fn;
|
||||
ncclRedOpDestroy_fn_t ncclRedOpDestroy_fn;
|
||||
ncclGroupStart_fn_t ncclGroupStart_fn;
|
||||
ncclGroupEnd_fn_t ncclGroupEnd_fn;
|
||||
ncclGetVersion_fn_t ncclGetVersion_fn;
|
||||
ncclGetUniqueId_fn_t ncclGetUniqueId_fn;
|
||||
ncclCommInitRank_fn_t ncclCommInitRank_fn;
|
||||
ncclCommInitAll_fn_t ncclCommInitAll_fn;
|
||||
ncclCommInitRankConfig_fn_t ncclCommInitRankConfig_fn;
|
||||
ncclCommFinalize_fn_t ncclCommFinalize_fn;
|
||||
ncclCommDestroy_fn_t ncclCommDestroy_fn;
|
||||
ncclCommAbort_fn_t ncclCommAbort_fn;
|
||||
ncclCommSplit_fn_t ncclCommSplit_fn;
|
||||
ncclGetErrorString_fn_t ncclGetErrorString_fn;
|
||||
ncclGetLastError_fn_t ncclGetLastError_fn;
|
||||
ncclCommGetAsyncError_fn_t ncclCommGetAsyncError_fn;
|
||||
ncclCommCount_fn_t ncclCommCount_fn;
|
||||
ncclCommCuDevice_fn_t ncclCommCuDevice_fn;
|
||||
ncclCommUserRank_fn_t ncclCommUserRank_fn;
|
||||
ncclMemAlloc_fn_t ncclMemAlloc_fn;
|
||||
ncclMemFree_fn_t ncclMemFree_fn;
|
||||
mscclLoadAlgo_fn_t mscclLoadAlgo_fn;
|
||||
mscclRunAlgo_fn_t mscclRunAlgo_fn;
|
||||
mscclUnloadAlgo_fn_t mscclUnloadAlgo_fn;
|
||||
ncclCommRegister_fn_t ncclCommRegister_fn;
|
||||
ncclCommDeregister_fn_t ncclCommDeregister_fn;
|
||||
|
||||
} rcclApiFuncTable;
|
||||
|
||||
RCCL_EXTERN_C_FINI
|
||||
+17
-17
@@ -178,14 +178,14 @@ static ncclResult_t ncclInit() {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
|
||||
ncclResult_t ncclGetVersion(int* version) {
|
||||
// Writes NCCL_VERSION_CODE to *version.
// Returns ncclInvalidArgument when `version` is NULL, ncclSuccess otherwise.
ncclResult_t ncclGetVersion_impl(int* version) {
  if (version != NULL) {
    *version = NCCL_VERSION_CODE;
    return ncclSuccess;
  }
  return ncclInvalidArgument;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
|
||||
ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) {
|
||||
NCCLCHECK(ncclInit());
|
||||
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
|
||||
ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
|
||||
@@ -2255,7 +2255,7 @@ constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = {
|
||||
};
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
|
||||
ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
|
||||
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
|
||||
rocmLibraryInit();
|
||||
|
||||
@@ -2271,7 +2271,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int totalnDev;
|
||||
int *gpuFlags = NULL;
|
||||
@@ -2339,7 +2339,7 @@ ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
|
||||
ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
|
||||
ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
int cudaDev;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
@@ -2462,7 +2462,7 @@ fail:
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
|
||||
ncclResult_t ncclCommFinalize(ncclComm_t comm) {
|
||||
ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
@@ -2576,7 +2576,7 @@ fail:
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
|
||||
if (comm == NULL) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
return ncclSuccess;
|
||||
@@ -2617,7 +2617,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
|
||||
if (comm == NULL) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
return ncclSuccess;
|
||||
@@ -2649,7 +2649,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
|
||||
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
|
||||
ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
|
||||
struct ncclCommInitRankAsyncJob *job = NULL;
|
||||
struct ncclComm* childComm = NCCL_COMM_NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
@@ -2713,7 +2713,7 @@ fail:
|
||||
}
|
||||
|
||||
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
|
||||
const char* ncclGetErrorString(ncclResult_t code) {
|
||||
const char* ncclGetErrorString_impl(ncclResult_t code) {
|
||||
switch (code) {
|
||||
case ncclSuccess : return "no error";
|
||||
case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)";
|
||||
@@ -2731,12 +2731,12 @@ const char* ncclGetErrorString(ncclResult_t code) {
|
||||
* comm is currently unused and can be set to NULL
|
||||
*/
|
||||
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
|
||||
const char* ncclGetLastError(ncclComm_t comm) {
|
||||
// Returns the ncclLastError string. `comm` is not consulted.
const char* ncclGetLastError_impl(ncclComm_t comm) {
  (void)comm;  // currently unused; callers may pass NULL
  const char* lastError = ncclLastError;
  return lastError;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
|
||||
ncclResult_t ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t *asyncError) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
|
||||
NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
|
||||
|
||||
@@ -2746,7 +2746,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
|
||||
ncclResult_t ncclCommCount_impl(const ncclComm_t comm, int* count) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
|
||||
@@ -2760,7 +2760,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
|
||||
ncclResult_t ncclCommCuDevice_impl(const ncclComm_t comm, int* devid) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
|
||||
@@ -2773,7 +2773,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
|
||||
ncclResult_t ncclCommUserRank_impl(const ncclComm_t comm, int* rank) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
|
||||
@@ -2786,7 +2786,7 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
|
||||
ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
@@ -2863,7 +2863,7 @@ fail:
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
|
||||
ncclResult_t ncclMemFree(void *ptr) {
|
||||
ncclResult_t ncclMemFree_impl(void *ptr) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int saveDevice;
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
/*
 * This translation unit exists only to check that api_trace.h can be
 * included from C. It must therefore be compiled as C, never as C++.
 */

#ifdef __cplusplus
#    error "C source file compiling as C++"
#endif
|
||||
|
||||
#include "api_trace.h"
|
||||
@@ -0,0 +1,656 @@
|
||||
|
||||
#include "api_trace.h"
|
||||
#include "core.h"
|
||||
#include "nccl.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#if defined(RCCL_ROCPROFILER_REGISTER) && RCCL_ROCPROFILER_REGISTER > 0
|
||||
# include <rocprofiler-register/rocprofiler-register.h>
|
||||
|
||||
# define ROCP_REG_VERSION \
|
||||
ROCPROFILER_REGISTER_COMPUTE_VERSION_3(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH)
|
||||
|
||||
ROCPROFILER_REGISTER_DEFINE_IMPORT(rccl, ROCP_REG_VERSION)
|
||||
#endif
|
||||
|
||||
ncclResult_t
|
||||
ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
|
||||
cudaStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclAllToAllv_impl(const void* sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void* recvbuff, const size_t recvcounts[],
|
||||
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
cudaStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm,
|
||||
cudaStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
|
||||
cudaStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
ncclRedOpCreatePreMulSum_impl(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype,
|
||||
ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
|
||||
ncclResult_t
|
||||
ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm);
|
||||
|
||||
ncclResult_t
|
||||
ncclGroupStart_impl();
|
||||
|
||||
ncclResult_t
|
||||
ncclGroupEnd_impl();
|
||||
|
||||
ncclResult_t
|
||||
ncclGetVersion_impl(int* version);
|
||||
|
||||
ncclResult_t
|
||||
ncclGetUniqueId_impl(ncclUniqueId* out);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlist);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommInitRankConfig_impl(ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank,
|
||||
ncclConfig_t* config);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommFinalize_impl(ncclComm_t comm);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommDestroy_impl(ncclComm_t comm);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommAbort_impl(ncclComm_t comm);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t* newcomm,
|
||||
ncclConfig_t* config);
|
||||
|
||||
const char*
|
||||
ncclGetErrorString_impl(ncclResult_t code);
|
||||
|
||||
const char*
|
||||
ncclGetLastError_impl(const ncclComm_t comm);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t* asyncError);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommCount_impl(const ncclComm_t comm, int* count);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommCuDevice_impl(const ncclComm_t comm, int* devid);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommUserRank_impl(const ncclComm_t comm, int* rank);
|
||||
|
||||
ncclResult_t
|
||||
ncclMemAlloc_impl(void** ptr, size_t size);
|
||||
|
||||
ncclResult_t
|
||||
ncclMemFree_impl(void* ptr);
|
||||
|
||||
ncclResult_t
|
||||
mscclLoadAlgo_impl(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle,
|
||||
int rank);
|
||||
|
||||
ncclResult_t
|
||||
mscclRunAlgo_impl(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
|
||||
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
|
||||
size_t count, ncclDataType_t dataType, int root, int peer,
|
||||
ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
|
||||
ncclResult_t
|
||||
mscclUnloadAlgo_impl(mscclAlgoHandle_t mscclAlgoHandle);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommDeregister_impl(const ncclComm_t comm, void* handle);
|
||||
|
||||
namespace rccl
|
||||
{
|
||||
namespace
|
||||
{
|
||||
|
||||
constexpr size_t
|
||||
compute_table_offset(size_t n)
|
||||
{
|
||||
return (sizeof(uint64_t) + (n * sizeof(void*)));
|
||||
}
|
||||
|
||||
constexpr size_t
|
||||
compute_table_size(size_t nmembers)
|
||||
{
|
||||
return (sizeof(uint64_t) + (nmembers * sizeof(void*)));
|
||||
}
|
||||
|
||||
#define RCCL_ASSERT_OFFSET(TABLE, MEMBER, IDX) \
|
||||
static_assert(offsetof(TABLE, MEMBER) == compute_table_offset(IDX), \
|
||||
"Do not re-arrange the table members")
|
||||
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllGather_fn, 0);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllReduce_fn, 1);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllToAll_fn, 2);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllToAllv_fn, 3);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclBroadcast_fn, 4);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGather_fn, 5);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclReduce_fn, 6);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclReduceScatter_fn, 7);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclScatter_fn, 8);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclSend_fn, 9);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRecv_fn, 10);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRedOpCreatePreMulSum_fn, 11);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRedOpDestroy_fn, 12);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGroupStart_fn, 13);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGroupEnd_fn, 14);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetVersion_fn, 15);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetUniqueId_fn, 16);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitRank_fn, 17);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitAll_fn, 18);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitRankConfig_fn, 19);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommFinalize_fn, 20);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDestroy_fn, 21);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommAbort_fn, 22);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommSplit_fn, 23);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetErrorString_fn, 24);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetLastError_fn, 25);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommGetAsyncError_fn, 26);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCount_fn, 27);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCuDevice_fn, 28);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommUserRank_fn, 29);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemAlloc_fn, 30);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemFree_fn, 31);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclLoadAlgo_fn, 32);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclRunAlgo_fn, 33);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclUnloadAlgo_fn, 34);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommRegister_fn, 35);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDeregister_fn, 36);
|
||||
|
||||
#undef RCCL_ASSERT_OFFSET
|
||||
|
||||
static_assert(sizeof(rcclApiFuncTable) == compute_table_size(37),
|
||||
"Update table major/step version and add a new offset assertion if this "
|
||||
"fails to compile");
|
||||
|
||||
std::array<unsigned char, sizeof(rcclApiFuncTable)> m_buffer = {};
|
||||
|
||||
rcclApiFuncTable*
|
||||
RcclGetFunctionTable_impl()
|
||||
{
|
||||
static auto* tbl =
|
||||
new(m_buffer.data()) rcclApiFuncTable{ sizeof(rcclApiFuncTable),
|
||||
&ncclAllGather_impl,
|
||||
&ncclAllReduce_impl,
|
||||
&ncclAllToAll_impl,
|
||||
&ncclAllToAllv_impl,
|
||||
&ncclBroadcast_impl,
|
||||
&ncclGather_impl,
|
||||
&ncclReduce_impl,
|
||||
&ncclReduceScatter_impl,
|
||||
&ncclScatter_impl,
|
||||
&ncclSend_impl,
|
||||
&ncclRecv_impl,
|
||||
&ncclRedOpCreatePreMulSum_impl,
|
||||
&ncclRedOpDestroy_impl,
|
||||
&ncclGroupStart_impl,
|
||||
&ncclGroupEnd_impl,
|
||||
&ncclGetVersion_impl,
|
||||
&ncclGetUniqueId_impl,
|
||||
&ncclCommInitRank_impl,
|
||||
&ncclCommInitAll_impl,
|
||||
&ncclCommInitRankConfig_impl,
|
||||
&ncclCommFinalize_impl,
|
||||
&ncclCommDestroy_impl,
|
||||
&ncclCommAbort_impl,
|
||||
&ncclCommSplit_impl,
|
||||
&ncclGetErrorString_impl,
|
||||
&ncclGetLastError_impl,
|
||||
&ncclCommGetAsyncError_impl,
|
||||
&ncclCommCount_impl,
|
||||
&ncclCommCuDevice_impl,
|
||||
&ncclCommUserRank_impl,
|
||||
&ncclMemAlloc_impl,
|
||||
&ncclMemFree_impl,
|
||||
&mscclLoadAlgo_impl,
|
||||
&mscclRunAlgo_impl,
|
||||
&mscclUnloadAlgo_impl,
|
||||
&ncclCommRegister_impl,
|
||||
&ncclCommDeregister_impl };
|
||||
|
||||
#if defined(RCCL_ROCPROFILER_REGISTER) && RCCL_ROCPROFILER_REGISTER > 0
|
||||
std::array<void*, 1> table_array{ tbl };
|
||||
rocprofiler_register_library_indentifier_t lib_id =
|
||||
rocprofiler_register_library_indentifier_t{};
|
||||
rocprofiler_register_error_code_t rocp_reg_status =
|
||||
rocprofiler_register_library_api_table(
|
||||
"rccl", &ROCPROFILER_REGISTER_IMPORT_FUNC(rccl), ROCP_REG_VERSION,
|
||||
table_array.data(), table_array.size(), &lib_id);
|
||||
|
||||
INFO(NCCL_COLL,
|
||||
"[rocprofiler-sdk-rccl][ = %d ] rocprofiler-register returned code = %d : %s",
|
||||
getpid(), rocp_reg_status, rocprofiler_register_error_string(rocp_reg_status));
|
||||
|
||||
if(rocp_reg_status != ROCP_REG_SUCCESS && rocp_reg_status != ROCP_REG_NO_TOOLS)
|
||||
WARN("[rocprofiler-sdk-rccl][%d] rocprofiler-register failed with error code %d "
|
||||
": %s",
|
||||
getpid(), rocp_reg_status,
|
||||
rocprofiler_register_error_string(rocp_reg_status));
|
||||
#endif
|
||||
|
||||
return tbl;
|
||||
}
|
||||
} // end of namespace
|
||||
|
||||
const rcclApiFuncTable*
|
||||
RcclGetFunctionTable()
|
||||
{
|
||||
static const auto* tbl = RcclGetFunctionTable_impl();
|
||||
return tbl;
|
||||
}
|
||||
} // end of namespace rccl
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff,
|
||||
size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllToAllv, const void* sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void* recvbuff, const size_t recvcounts[],
|
||||
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
|
||||
hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count,
|
||||
ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
int peer, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t* op, void* scalar,
|
||||
ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks,
|
||||
ncclUniqueId commId, int myrank);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks,
|
||||
ncclUniqueId commId, int myrank, ncclConfig_t* config);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key,
|
||||
ncclComm_t* newcomm, ncclConfig_t* config);
|
||||
|
||||
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
|
||||
|
||||
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t* asyncError);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemAlloc, void** ptr, size_t size);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemFree, void* ptr);
|
||||
|
||||
NCCL_API(ncclResult_t, mscclLoadAlgo, const char* mscclAlgoFilePath,
|
||||
mscclAlgoHandle_t* mscclAlgoHandle, int rank);
|
||||
|
||||
NCCL_API(ncclResult_t, mscclRunAlgo, const void* sendBuff, const size_t sendCounts[],
|
||||
const size_t sDisPls[], void* recvBuff, const size_t recvCounts[],
|
||||
const size_t rDisPls[], size_t count, ncclDataType_t dataType, int root,
|
||||
int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
|
||||
NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size,
|
||||
void** handle);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclAllGather_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
                           ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclAllGather_fn(sendbuff, recvbuff, sendcount, datatype, comm, stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclAllReduce_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
                           ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
                           cudaStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclAllReduce_fn(sendbuff, recvbuff, count, datatype, op, comm, stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclAllToAll_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
                          ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclAllToAll_fn(sendbuff, recvbuff, count, datatype, comm, stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclAllToAllv_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
                           const size_t sdispls[], void* recvbuff,
                           const size_t recvcounts[], const size_t rdispls[],
                           ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclAllToAllv_fn(sendbuff, sendcounts, sdispls, recvbuff, recvcounts,
                                   rdispls, datatype, comm, stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclBroadcast_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
                           ncclDataType_t datatype, int root, ncclComm_t comm,
                           cudaStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclBroadcast_fn(sendbuff, recvbuff, count, datatype, root, comm,
                                   stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclGather_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
                        ncclDataType_t datatype, int root, ncclComm_t comm,
                        hipStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclGather_fn(sendbuff, recvbuff, sendcount, datatype, root, comm,
                                stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclReduce_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
                        ncclDataType_t datatype, ncclRedOp_t op, int root,
                        ncclComm_t comm, cudaStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclReduce_fn(sendbuff, recvbuff, count, datatype, op, root, comm,
                                stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclReduceScatter_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
                               ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
                               cudaStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclReduceScatter_fn(sendbuff, recvbuff, recvcount, datatype, op, comm,
                                       stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclScatter_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
                         ncclDataType_t datatype, int root, ncclComm_t comm,
                         hipStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclScatter_fn(sendbuff, recvbuff, recvcount, datatype, root, comm,
                                 stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclSend_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
                      int peer, ncclComm_t comm, cudaStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclSend_fn(sendbuff, count, datatype, peer, comm, stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclRecv_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
                      ncclComm_t comm, cudaStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclRecv_fn(recvbuff, count, datatype, peer, comm, stream);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclRedOpCreatePreMulSum_fn
// slot of the table returned by rccl::RcclGetFunctionTable() and returns its
// result.
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
                                      ncclDataType_t        datatype,
                                      ncclScalarResidence_t residence, ncclComm_t comm)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclRedOpCreatePreMulSum_fn(op, scalar, datatype, residence, comm);
}
|
||||
|
||||
// Public entry point: forwards both arguments to the ncclRedOpDestroy_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclRedOpDestroy_fn(op, comm);
}
|
||||
|
||||
// Public entry point: invokes the ncclGroupStart_fn slot of the table returned
// by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclGroupStart()
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclGroupStart_fn();
}
|
||||
|
||||
// Public entry point: invokes the ncclGroupEnd_fn slot of the table returned by
// rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclGroupEnd()
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclGroupEnd_fn();
}
|
||||
|
||||
// Public entry point: forwards `version` to the ncclGetVersion_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclGetVersion(int* version)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclGetVersion_fn(version);
}
|
||||
|
||||
// Public entry point: forwards `out` to the ncclGetUniqueId_fn slot of the table
// returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclGetUniqueId(ncclUniqueId* out)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclGetUniqueId_fn(out);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclCommInitRank_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId,
                              int myrank)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommInitRank_fn(newcomm, nranks, commId, myrank);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclCommInitAll_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommInitAll_fn(comms, ndev, devlist);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclCommInitRankConfig_fn
// slot of the table returned by rccl::RcclGetFunctionTable() and returns its
// result.
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId,
                                    int myrank, ncclConfig_t* config)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommInitRankConfig_fn(comm, nranks, commId, myrank, config);
}
|
||||
|
||||
// Public entry point: forwards `comm` to the ncclCommFinalize_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommFinalize(ncclComm_t comm)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommFinalize_fn(comm);
}
|
||||
|
||||
// Public entry point: forwards `comm` to the ncclCommDestroy_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommDestroy(ncclComm_t comm)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommDestroy_fn(comm);
}
|
||||
|
||||
// Public entry point: forwards `comm` to the ncclCommAbort_fn slot of the table
// returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommAbort(ncclComm_t comm)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommAbort_fn(comm);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclCommSplit_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm,
                           ncclConfig_t* config)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommSplit_fn(comm, color, key, newcomm, config);
}
|
||||
|
||||
const char*
|
||||
ncclGetErrorString(ncclResult_t code)
|
||||
{
|
||||
return ::rccl::RcclGetFunctionTable()->ncclGetErrorString_fn(code);
|
||||
}
|
||||
|
||||
const char*
|
||||
ncclGetLastError(const ncclComm_t comm)
|
||||
{
|
||||
return ::rccl::RcclGetFunctionTable()->ncclGetLastError_fn(comm);
|
||||
}
|
||||
|
||||
// Public entry point: forwards both arguments to the ncclCommGetAsyncError_fn
// slot of the table returned by rccl::RcclGetFunctionTable() and returns its
// result.
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommGetAsyncError_fn(comm, asyncError);
}
|
||||
|
||||
// Public entry point: forwards both arguments to the ncclCommCount_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommCount_fn(comm, count);
}
|
||||
|
||||
// Public entry point: forwards both arguments to the ncclCommCuDevice_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommCuDevice_fn(comm, devid);
}
|
||||
|
||||
// Public entry point: forwards both arguments to the ncclCommUserRank_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommUserRank_fn(comm, rank);
}
|
||||
|
||||
// Public entry point: forwards both arguments to the ncclMemAlloc_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclMemAlloc(void** ptr, size_t size)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclMemAlloc_fn(ptr, size);
}
|
||||
|
||||
// Public entry point: forwards `ptr` to the ncclMemFree_fn slot of the table
// returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclMemFree(void* ptr)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclMemFree_fn(ptr);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the mscclLoadAlgo_fn slot of the
// table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath,
                           mscclAlgoHandle_t* mscclAlgoHandle, int rank)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->mscclLoadAlgo_fn(mscclAlgoFilePath, mscclAlgoHandle, rank);
}
|
||||
|
||||
// Public entry point: forwards all arguments, in order, to the mscclRunAlgo_fn
// slot of the table returned by rccl::RcclGetFunctionTable() and returns its
// result.
ncclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[],
                          const size_t sDisPls[], void* recvBuff,
                          const size_t recvCounts[], const size_t rDisPls[], size_t count,
                          ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
                          mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm,
                          hipStream_t stream)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->mscclRunAlgo_fn(sendBuff, sendCounts, sDisPls, recvBuff, recvCounts,
                                  rDisPls, count, dataType, root, peer, op,
                                  mscclAlgoHandle, comm, stream);
}
|
||||
|
||||
// Public entry point: forwards `mscclAlgoHandle` to the mscclUnloadAlgo_fn slot
// of the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->mscclUnloadAlgo_fn(mscclAlgoHandle);
}
|
||||
|
||||
// Public entry point: forwards all arguments to the ncclCommRegister_fn slot of
// the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size,
                              void** handle)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommRegister_fn(comm, buff, size, handle);
}
|
||||
|
||||
// Public entry point: forwards both arguments to the ncclCommDeregister_fn slot
// of the table returned by rccl::RcclGetFunctionTable() and returns its result.
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle)
{
    const auto* table = ::rccl::RcclGetFunctionTable();
    return table->ncclCommDeregister_fn(comm, handle);
}
|
||||
+4
-3
@@ -7,11 +7,12 @@
|
||||
#include "msccl/msccl_parser.h"
|
||||
#include "msccl/msccl_setup.h"
|
||||
#include "msccl/msccl_status.h"
|
||||
#include "api_trace.h"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
NCCL_API(ncclResult_t, mscclLoadAlgo, const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
|
||||
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) {
|
||||
ncclResult_t mscclLoadAlgo_impl(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) {
|
||||
mscclStatus& status = mscclGetStatus(rank);
|
||||
|
||||
if (status.freeAlgoHandles.size() == 0) {
|
||||
@@ -39,7 +40,7 @@ NCCL_API(ncclResult_t, mscclRunAlgo,
|
||||
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
|
||||
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
|
||||
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t mscclRunAlgo(
|
||||
ncclResult_t mscclRunAlgo_impl(
|
||||
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
|
||||
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
|
||||
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
|
||||
@@ -76,7 +77,7 @@ ncclResult_t mscclRunAlgo(
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle);
|
||||
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle) {
|
||||
ncclResult_t mscclUnloadAlgo_impl(mscclAlgoHandle_t mscclAlgoHandle) {
|
||||
// deprecated
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+3
-2
@@ -9,6 +9,7 @@
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
#include "register.h"
|
||||
#include "api_trace.h"
|
||||
|
||||
ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
@@ -151,7 +152,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
|
||||
ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
|
||||
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
|
||||
NCCLCHECK(ncclRegister(comm, buff, size, handle));
|
||||
@@ -159,7 +160,7 @@ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, vo
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
|
||||
ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
|
||||
struct ncclReg* reg = (struct ncclReg*)handle;
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "PtrUnion.hpp"
|
||||
|
||||
#include "api_trace.h"
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
size_t DataTypeToBytes(ncclDataType_t const dataType)
|
||||
|
||||
مرجع در شماره جدید
Block a user