adding all nccl apis to api_support to enable rccl tracing by rocprofv3 (#1297)

* adding all nccl apis to api_support to enable rccl tracing by rocprofv3

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
This commit is contained in:
mberenjk
2024-08-22 12:36:07 -05:00
zatwierdzone przez GitHub
rodzic 93b0c7418f
commit db840f024e
11 zmienionych plików z 945 dodań i 38 usunięć
+20
Wyświetl plik
@@ -386,6 +386,7 @@ set(SRC_FILES
src/include/align.h
src/include/alloc.h
src/include/archinfo.h
src/include/api_trace.h
src/include/argcheck.h
src/include/BfdBacktrace.hpp
src/include/bootstrap.h
@@ -473,6 +474,8 @@ set(SRC_FILES
src/misc/alt_rsmi.cc
src/misc/archinfo.cc
src/misc/argcheck.cc
src/misc/api_trace.c
src/misc/api_trace.cc
# src/misc/cudawrap.cc
# src/misc/gdrwrap.cc
src/misc/ibvsymbols.cc
@@ -691,6 +694,21 @@ endif()
## Set RCCL linked library directories
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
## rocprofiler-register support.
## The RCCL_ROCPROFILER_REGISTER option is only offered (default ON) when
## ROCM_VERSION compares >= "60100"; otherwise it is forced OFF in the cache,
## after warning if the user had tried to enable it.
## NOTE(review): the gate below tests "60100" while the warning text says
## "ROCm < 6.2" -- confirm which version is the intended minimum.
if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
else()
if(RCCL_ROCPROFILER_REGISTER)
message(AUTHOR_WARNING "RCCL_ROCPROFILER_REGISTER is not valid option for ROCm < 6.2. Current ROCm version: ${ROCM_VERSION}")
endif()
## FORCE overwrites any value already stored in the cache.
set(RCCL_ROCPROFILER_REGISTER OFF CACHE BOOL "" FORCE)
endif()
## When enabled: the package is mandatory (REQUIRED), the rccl target is compiled
## with RCCL_ROCPROFILER_REGISTER=1 and linked privately against the library.
if(RCCL_ROCPROFILER_REGISTER)
find_package(rocprofiler-register REQUIRED)
target_compile_definitions(rccl PRIVATE RCCL_ROCPROFILER_REGISTER=1)
target_link_libraries(
rccl PRIVATE rocprofiler-register::rocprofiler-register)
endif()
## Set RCCL linked libraries
if (HAVE_BFD)
target_link_libraries(rccl PRIVATE bfd)
@@ -766,6 +784,8 @@ rocm_set_soversion(rccl "1.0")
rocm_install_targets(TARGETS rccl)
rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/nccl_net.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
rocm_install(FILES src/include/api_trace.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail)
file(COPY tools/msccl-algorithms DESTINATION ${PROJECT_BINARY_DIR})
file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})
## Install Algorithm files under share folder
+29 -11
Wyświetl plik
@@ -9,12 +9,14 @@
#include "enqueue.h"
#include "graph/topo.h"
#include "nccl.h"
#include "api_trace.h"
#include "msccl/msccl_lifecycle.h"
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
@@ -37,7 +39,9 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsAllReduce {
size_t bytes;
@@ -66,7 +70,9 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream) {
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllToAllSchema[] = {
@@ -107,7 +113,9 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, nc
NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
struct NvtxParamsAllToAllv {
@@ -153,7 +161,8 @@ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], cons
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsBroadcast {
size_t bytes;
@@ -187,7 +196,8 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
struct NvtxParamsGather {
size_t bytes;
@@ -224,7 +234,8 @@ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsReduce {
size_t bytes;
@@ -254,7 +265,9 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsReduceScatter {
size_t bytes;
@@ -282,7 +295,9 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
struct NvtxParamsScatter {
size_t bytes;
@@ -328,7 +343,9 @@ constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
@@ -351,7 +368,8 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
+3 -2
Wyświetl plik
@@ -20,6 +20,7 @@
#include "rccl_vars.h"
#include "transport.h"
#include "common.h"
#include "api_trace.h"
#include <cassert>
#include <cstring> // std::memcpy
#include <cinttypes> // PRIx64
@@ -2170,7 +2171,7 @@ fail:
}
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
ncclResult_t ncclRedOpCreatePreMulSum_impl(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
/* join init thread before creating PreMulSum op. */
NCCLCHECK(ncclCommEnsureReady(comm));
@@ -2209,7 +2210,7 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
}
NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm);
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
ncclResult_t ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm) {
if (0 <= int(op) && int(op) < int(ncclNumOps)) {
WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
return ncclInvalidArgument;
+3 -2
Wyświetl plik
@@ -11,6 +11,7 @@
#include "enqueue.h"
#include "transport.h"
#include "channel.h"
#include "api_trace.h"
#include <assert.h>
#include "msccl/msccl_lifecycle.h"
@@ -83,7 +84,7 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
}
NCCL_API(ncclResult_t, ncclGroupStart);
ncclResult_t ncclGroupStart() {
ncclResult_t ncclGroupStart_impl() {
ncclResult_t ret = ncclSuccess;
NVTX3_FUNC_RANGE_IN(nccl_domain);
@@ -101,7 +102,7 @@ ncclResult_t ncclGroupStartInternal() {
}
NCCL_API(ncclResult_t, ncclGroupEnd);
ncclResult_t ncclGroupEnd() {
ncclResult_t ncclGroupEnd_impl() {
ncclResult_t ret = ncclSuccess;
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit);
+200
Wyświetl plik
@@ -0,0 +1,200 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "nccl.h"
#include <stddef.h>
#include <stdint.h>
// Version numbers describing the dispatch-table layout declared in this header.
// should only be increased if fundamental changes to dispatch table(s)
#define RCCL_API_TRACE_VERSION_MAJOR 0
// should be increased every time new members are added to existing dispatch tables
#define RCCL_API_TRACE_VERSION_PATCH 0
// RCCL_EXTERN_C_INIT opens the C-linkage region used by this header: under C++
// it expands to `extern "C" {`, under C it expands to nothing. The definition is
// skipped when the includer has already defined the macro.
#if !defined(RCCL_EXTERN_C_INIT)
# ifdef __cplusplus
# define RCCL_EXTERN_C_INIT \
extern "C" \
{
# else
# define RCCL_EXTERN_C_INIT
# endif
#endif
// RCCL_EXTERN_C_FINI closes the region opened by RCCL_EXTERN_C_INIT: `}` under
// C++, nothing under C. Also skipped when already defined by the includer.
#if !defined(RCCL_EXTERN_C_FINI)
# ifdef __cplusplus
# define RCCL_EXTERN_C_FINI }
# else
# define RCCL_EXTERN_C_FINI
# endif
#endif
RCCL_EXTERN_C_INIT
typedef uint64_t rccl_range_id_t;

// Function-pointer types for every entry of rcclApiFuncTable. Each
// `<name>_fn_t` mirrors the parameter list of the public `<name>` API.
//
// Stream parameters are spelled hipStream_t throughout so the header does not
// depend on a cudaStream_t alias being available to the includer.
// Zero-argument entry points are declared with `(void)`: this header must stay
// valid C, where an empty `()` declares a function without a prototype.

typedef ncclResult_t (*ncclAllGather_fn_t)(const void* sendbuff, void* recvbuff,
                                           size_t sendcount, ncclDataType_t datatype,
                                           ncclComm_t comm, hipStream_t stream);

typedef ncclResult_t (*ncclAllReduce_fn_t)(const void* sendbuff, void* recvbuff,
                                           size_t count, ncclDataType_t datatype,
                                           ncclRedOp_t op, struct ncclComm* comm,
                                           hipStream_t stream);

typedef ncclResult_t (*ncclAllToAll_fn_t)(const void* sendbuff, void* recvbuff,
                                          size_t count, ncclDataType_t datatype,
                                          ncclComm_t comm, hipStream_t stream);

typedef ncclResult_t (*ncclAllToAllv_fn_t)(
    const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
    void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);

typedef ncclResult_t (*ncclBroadcast_fn_t)(const void* sendbuff, void* recvbuff,
                                           size_t count, ncclDataType_t datatype,
                                           int root, ncclComm_t comm,
                                           hipStream_t stream);

typedef ncclResult_t (*ncclGather_fn_t)(const void* sendbuff, void* recvbuff,
                                        size_t sendcount, ncclDataType_t datatype,
                                        int root, ncclComm_t comm, hipStream_t stream);

typedef ncclResult_t (*ncclReduce_fn_t)(const void* sendbuff, void* recvbuff,
                                        size_t count, ncclDataType_t datatype,
                                        ncclRedOp_t op, int root, ncclComm_t comm,
                                        hipStream_t stream);

typedef ncclResult_t (*ncclReduceScatter_fn_t)(const void* sendbuff, void* recvbuff,
                                               size_t recvcount, ncclDataType_t datatype,
                                               ncclRedOp_t op, struct ncclComm* comm,
                                               hipStream_t stream);

typedef ncclResult_t (*ncclScatter_fn_t)(const void* sendbuff, void* recvbuff,
                                         size_t recvcount, ncclDataType_t datatype,
                                         int root, ncclComm_t comm, hipStream_t stream);

typedef ncclResult_t (*ncclSend_fn_t)(const void* sendbuff, size_t count,
                                      ncclDataType_t datatype, int peer, ncclComm_t comm,
                                      hipStream_t stream);

typedef ncclResult_t (*ncclRecv_fn_t)(void* recvbuff, size_t count,
                                      ncclDataType_t datatype, int peer, ncclComm_t comm,
                                      hipStream_t stream);

typedef ncclResult_t (*ncclRedOpCreatePreMulSum_fn_t)(ncclRedOp_t* op, void* scalar,
                                                      ncclDataType_t datatype,
                                                      ncclScalarResidence_t residence,
                                                      ncclComm_t comm);

typedef ncclResult_t (*ncclRedOpDestroy_fn_t)(ncclRedOp_t op, ncclComm_t comm);

// `(void)` rather than `()`: see the note at the top of this typedef list.
typedef ncclResult_t (*ncclGroupStart_fn_t)(void);

typedef ncclResult_t (*ncclGroupEnd_fn_t)(void);

typedef ncclResult_t (*ncclGetVersion_fn_t)(int* version);

typedef ncclResult_t (*ncclGetUniqueId_fn_t)(ncclUniqueId* out);

typedef ncclResult_t (*ncclCommInitRank_fn_t)(ncclComm_t* newcomm, int nranks,
                                              ncclUniqueId commId, int myrank);

typedef ncclResult_t (*ncclCommInitAll_fn_t)(ncclComm_t* comms, int ndev,
                                             const int* devlist);

typedef ncclResult_t (*ncclCommInitRankConfig_fn_t)(ncclComm_t* comm, int nranks,
                                                    ncclUniqueId commId, int myrank,
                                                    ncclConfig_t* config);

typedef ncclResult_t (*ncclCommFinalize_fn_t)(ncclComm_t comm);

typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);

typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);

typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key,
                                           ncclComm_t* newcomm, ncclConfig_t* config);

typedef const char* (*ncclGetErrorString_fn_t)(ncclResult_t code);

typedef const char* (*ncclGetLastError_fn_t)(const ncclComm_t comm);

typedef ncclResult_t (*ncclCommGetAsyncError_fn_t)(ncclComm_t comm,
                                                   ncclResult_t* asyncError);

typedef ncclResult_t (*ncclCommCount_fn_t)(const ncclComm_t comm, int* count);

typedef ncclResult_t (*ncclCommCuDevice_fn_t)(const ncclComm_t comm, int* devid);

typedef ncclResult_t (*ncclCommUserRank_fn_t)(const ncclComm_t comm, int* rank);

typedef ncclResult_t (*ncclMemAlloc_fn_t)(void** ptr, size_t size);

typedef ncclResult_t (*ncclMemFree_fn_t)(void* ptr);

typedef ncclResult_t (*mscclLoadAlgo_fn_t)(const char* mscclAlgoFilePath,
                                           mscclAlgoHandle_t* mscclAlgoHandle, int rank);

typedef ncclResult_t (*mscclRunAlgo_fn_t)(
    const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
    void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], size_t count,
    ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
    mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);

typedef ncclResult_t (*mscclUnloadAlgo_fn_t)(mscclAlgoHandle_t mscclAlgoHandle);

typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,
                                              size_t size, void** handle);

typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
// Dispatch table of RCCL API entry points: one leading uint64_t followed by one
// function pointer per traced API.
// Member order is part of the layout contract -- api_trace.cc static-asserts the
// offset of every member and the total size -- so do not re-arrange members.
typedef struct rcclApiFuncTable
{
// Set to sizeof(rcclApiFuncTable) where the table is constructed in api_trace.cc.
uint64_t size;
ncclAllGather_fn_t ncclAllGather_fn;
ncclAllReduce_fn_t ncclAllReduce_fn;
ncclAllToAll_fn_t ncclAllToAll_fn;
ncclAllToAllv_fn_t ncclAllToAllv_fn;
ncclBroadcast_fn_t ncclBroadcast_fn;
ncclGather_fn_t ncclGather_fn;
ncclReduce_fn_t ncclReduce_fn;
ncclReduceScatter_fn_t ncclReduceScatter_fn;
ncclScatter_fn_t ncclScatter_fn;
ncclSend_fn_t ncclSend_fn;
ncclRecv_fn_t ncclRecv_fn;
ncclRedOpCreatePreMulSum_fn_t ncclRedOpCreatePreMulSum_fn;
ncclRedOpDestroy_fn_t ncclRedOpDestroy_fn;
ncclGroupStart_fn_t ncclGroupStart_fn;
ncclGroupEnd_fn_t ncclGroupEnd_fn;
ncclGetVersion_fn_t ncclGetVersion_fn;
ncclGetUniqueId_fn_t ncclGetUniqueId_fn;
ncclCommInitRank_fn_t ncclCommInitRank_fn;
ncclCommInitAll_fn_t ncclCommInitAll_fn;
ncclCommInitRankConfig_fn_t ncclCommInitRankConfig_fn;
ncclCommFinalize_fn_t ncclCommFinalize_fn;
ncclCommDestroy_fn_t ncclCommDestroy_fn;
ncclCommAbort_fn_t ncclCommAbort_fn;
ncclCommSplit_fn_t ncclCommSplit_fn;
ncclGetErrorString_fn_t ncclGetErrorString_fn;
ncclGetLastError_fn_t ncclGetLastError_fn;
ncclCommGetAsyncError_fn_t ncclCommGetAsyncError_fn;
ncclCommCount_fn_t ncclCommCount_fn;
ncclCommCuDevice_fn_t ncclCommCuDevice_fn;
ncclCommUserRank_fn_t ncclCommUserRank_fn;
ncclMemAlloc_fn_t ncclMemAlloc_fn;
ncclMemFree_fn_t ncclMemFree_fn;
mscclLoadAlgo_fn_t mscclLoadAlgo_fn;
mscclRunAlgo_fn_t mscclRunAlgo_fn;
mscclUnloadAlgo_fn_t mscclUnloadAlgo_fn;
ncclCommRegister_fn_t ncclCommRegister_fn;
ncclCommDeregister_fn_t ncclCommDeregister_fn;
} rcclApiFuncTable;
RCCL_EXTERN_C_FINI
+17 -17
Wyświetl plik
@@ -178,14 +178,14 @@ static ncclResult_t ncclInit() {
}
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
ncclResult_t ncclGetVersion(int* version) {
// Stores NCCL_VERSION_CODE into *version.
// Returns ncclInvalidArgument when `version` is NULL, ncclSuccess otherwise.
ncclResult_t ncclGetVersion_impl(int* version) {
  if (version != NULL) {
    *version = NCCL_VERSION_CODE;
    return ncclSuccess;
  }
  return ncclInvalidArgument;
}
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) {
NCCLCHECK(ncclInit());
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
@@ -2255,7 +2255,7 @@ constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = {
};
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
rocmLibraryInit();
@@ -2271,7 +2271,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
}
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlist) {
ncclResult_t ret = ncclSuccess;
int totalnDev;
int *gpuFlags = NULL;
@@ -2339,7 +2339,7 @@ ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
}
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
int cudaDev;
ncclResult_t ret = ncclSuccess;
@@ -2462,7 +2462,7 @@ fail:
}
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
ncclResult_t ncclCommFinalize(ncclComm_t comm) {
ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret = ncclSuccess;
@@ -2576,7 +2576,7 @@ fail:
}
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
if (comm == NULL) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
return ncclSuccess;
@@ -2617,7 +2617,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
}
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
ncclResult_t ncclCommAbort(ncclComm_t comm) {
ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
if (comm == NULL) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
return ncclSuccess;
@@ -2649,7 +2649,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
}
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
struct ncclCommInitRankAsyncJob *job = NULL;
struct ncclComm* childComm = NCCL_COMM_NULL;
ncclResult_t res = ncclSuccess;
@@ -2713,7 +2713,7 @@ fail:
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
const char* ncclGetErrorString(ncclResult_t code) {
const char* ncclGetErrorString_impl(ncclResult_t code) {
switch (code) {
case ncclSuccess : return "no error";
case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)";
@@ -2731,12 +2731,12 @@ const char* ncclGetErrorString(ncclResult_t code) {
* comm is currently unused and can be set to NULL
*/
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
const char* ncclGetLastError(ncclComm_t comm) {
// Returns ncclLastError. The `comm` argument is accepted but never read here.
const char* ncclGetLastError_impl(ncclComm_t comm) {
return ncclLastError;
}
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
ncclResult_t ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t *asyncError) {
NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
@@ -2746,7 +2746,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
}
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
ncclResult_t ncclCommCount_impl(const ncclComm_t comm, int* count) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
@@ -2760,7 +2760,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
}
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
ncclResult_t ncclCommCuDevice_impl(const ncclComm_t comm, int* devid) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
@@ -2773,7 +2773,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
}
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
ncclResult_t ncclCommUserRank_impl(const ncclComm_t comm, int* rank) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
@@ -2786,7 +2786,7 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
}
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret = ncclSuccess;
@@ -2863,7 +2863,7 @@ fail:
}
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
ncclResult_t ncclMemFree(void *ptr) {
ncclResult_t ncclMemFree_impl(void *ptr) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret = ncclSuccess;
int saveDevice;
+9
Wyświetl plik
@@ -0,0 +1,9 @@
/*
 * C-compatibility probe for api_trace.h.
 *
 * This translation unit contains no code of its own: it must be built by a C
 * compiler and only includes the header, so any C++-only construct that slips
 * into api_trace.h breaks the build here.
 */
#ifdef __cplusplus
#    error "C source file compiling as C++"
#endif

#include "api_trace.h"
+656
Wyświetl plik
@@ -0,0 +1,656 @@
#include "api_trace.h"
#include "core.h"
#include "nccl.h"
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <new>
#include <vector>
#if defined(RCCL_ROCPROFILER_REGISTER) && RCCL_ROCPROFILER_REGISTER > 0
# include <rocprofiler-register/rocprofiler-register.h>
# define ROCP_REG_VERSION \
ROCPROFILER_REGISTER_COMPUTE_VERSION_3(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH)
ROCPROFILER_REGISTER_DEFINE_IMPORT(rccl, ROCP_REG_VERSION)
#endif
// Forward declarations of the `<name>_impl` functions whose addresses populate
// rcclApiFuncTable below. They are declared here rather than pulled from a
// header; each parameter list matches the corresponding `<name>_fn_t` typedef
// in api_trace.h.
ncclResult_t
ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t
ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
cudaStream_t stream);
ncclResult_t
ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t
ncclAllToAllv_impl(const void* sendbuff, const size_t sendcounts[],
const size_t sdispls[], void* recvbuff, const size_t recvcounts[],
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
ncclResult_t
ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
cudaStream_t stream);
ncclResult_t
ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t
ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm,
cudaStream_t stream);
ncclResult_t
ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
cudaStream_t stream);
ncclResult_t
ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t
ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t
ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t
ncclRedOpCreatePreMulSum_impl(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype,
ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t
ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t
ncclGroupStart_impl();
ncclResult_t
ncclGroupEnd_impl();
ncclResult_t
ncclGetVersion_impl(int* version);
ncclResult_t
ncclGetUniqueId_impl(ncclUniqueId* out);
ncclResult_t
ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
ncclResult_t
ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlist);
ncclResult_t
ncclCommInitRankConfig_impl(ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank,
ncclConfig_t* config);
ncclResult_t
ncclCommFinalize_impl(ncclComm_t comm);
ncclResult_t
ncclCommDestroy_impl(ncclComm_t comm);
ncclResult_t
ncclCommAbort_impl(ncclComm_t comm);
ncclResult_t
ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t* newcomm,
ncclConfig_t* config);
const char*
ncclGetErrorString_impl(ncclResult_t code);
const char*
ncclGetLastError_impl(const ncclComm_t comm);
ncclResult_t
ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t* asyncError);
ncclResult_t
ncclCommCount_impl(const ncclComm_t comm, int* count);
ncclResult_t
ncclCommCuDevice_impl(const ncclComm_t comm, int* devid);
ncclResult_t
ncclCommUserRank_impl(const ncclComm_t comm, int* rank);
ncclResult_t
ncclMemAlloc_impl(void** ptr, size_t size);
ncclResult_t
ncclMemFree_impl(void* ptr);
ncclResult_t
mscclLoadAlgo_impl(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle,
int rank);
ncclResult_t
mscclRunAlgo_impl(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer,
ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm,
hipStream_t stream);
ncclResult_t
mscclUnloadAlgo_impl(mscclAlgoHandle_t mscclAlgoHandle);
ncclResult_t
ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t
ncclCommDeregister_impl(const ncclComm_t comm, void* handle);
namespace rccl
{
namespace
{
// Expected byte offset of the n-th (zero-based) function pointer in a table
// laid out as one uint64_t followed by pointer-sized members.
constexpr size_t
compute_table_offset(size_t n)
{
    return n * sizeof(void*) + sizeof(uint64_t);
}
// Expected total size in bytes of a table made of one uint64_t followed by
// `nmembers` pointer-sized members.
constexpr size_t
compute_table_size(size_t nmembers)
{
    return nmembers * sizeof(void*) + sizeof(uint64_t);
}
// Compile-time layout guard for rcclApiFuncTable.
// RCCL_ASSERT_OFFSET(TABLE, MEMBER, IDX) fails the build unless MEMBER sits at
// the offset compute_table_offset(IDX) predicts, i.e. unless it is the IDX-th
// function pointer after the leading uint64_t. One assertion per member pins
// the member order.
#define RCCL_ASSERT_OFFSET(TABLE, MEMBER, IDX) \
static_assert(offsetof(TABLE, MEMBER) == compute_table_offset(IDX), \
"Do not re-arrange the table members")
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllGather_fn, 0);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllReduce_fn, 1);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllToAll_fn, 2);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllToAllv_fn, 3);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclBroadcast_fn, 4);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGather_fn, 5);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclReduce_fn, 6);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclReduceScatter_fn, 7);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclScatter_fn, 8);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclSend_fn, 9);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRecv_fn, 10);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRedOpCreatePreMulSum_fn, 11);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclRedOpDestroy_fn, 12);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGroupStart_fn, 13);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGroupEnd_fn, 14);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetVersion_fn, 15);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetUniqueId_fn, 16);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitRank_fn, 17);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitAll_fn, 18);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitRankConfig_fn, 19);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommFinalize_fn, 20);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDestroy_fn, 21);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommAbort_fn, 22);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommSplit_fn, 23);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetErrorString_fn, 24);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetLastError_fn, 25);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommGetAsyncError_fn, 26);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCount_fn, 27);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCuDevice_fn, 28);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommUserRank_fn, 29);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemAlloc_fn, 30);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemFree_fn, 31);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclLoadAlgo_fn, 32);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclRunAlgo_fn, 33);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclUnloadAlgo_fn, 34);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommRegister_fn, 35);
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDeregister_fn, 36);
#undef RCCL_ASSERT_OFFSET
// Total-size guard: 37 is the number of function-pointer members asserted
// above. Adding a member changes sizeof(rcclApiFuncTable) and trips this until
// the count is updated and a matching offset assertion is added.
static_assert(sizeof(rcclApiFuncTable) == compute_table_size(37),
"Update table major/step version and add a new offset assertion if this "
"fails to compile");
std::array<unsigned char, sizeof(rcclApiFuncTable)> m_buffer = {};
rcclApiFuncTable*
RcclGetFunctionTable_impl()
{
static auto* tbl =
new(m_buffer.data()) rcclApiFuncTable{ sizeof(rcclApiFuncTable),
&ncclAllGather_impl,
&ncclAllReduce_impl,
&ncclAllToAll_impl,
&ncclAllToAllv_impl,
&ncclBroadcast_impl,
&ncclGather_impl,
&ncclReduce_impl,
&ncclReduceScatter_impl,
&ncclScatter_impl,
&ncclSend_impl,
&ncclRecv_impl,
&ncclRedOpCreatePreMulSum_impl,
&ncclRedOpDestroy_impl,
&ncclGroupStart_impl,
&ncclGroupEnd_impl,
&ncclGetVersion_impl,
&ncclGetUniqueId_impl,
&ncclCommInitRank_impl,
&ncclCommInitAll_impl,
&ncclCommInitRankConfig_impl,
&ncclCommFinalize_impl,
&ncclCommDestroy_impl,
&ncclCommAbort_impl,
&ncclCommSplit_impl,
&ncclGetErrorString_impl,
&ncclGetLastError_impl,
&ncclCommGetAsyncError_impl,
&ncclCommCount_impl,
&ncclCommCuDevice_impl,
&ncclCommUserRank_impl,
&ncclMemAlloc_impl,
&ncclMemFree_impl,
&mscclLoadAlgo_impl,
&mscclRunAlgo_impl,
&mscclUnloadAlgo_impl,
&ncclCommRegister_impl,
&ncclCommDeregister_impl };
#if defined(RCCL_ROCPROFILER_REGISTER) && RCCL_ROCPROFILER_REGISTER > 0
std::array<void*, 1> table_array{ tbl };
rocprofiler_register_library_indentifier_t lib_id =
rocprofiler_register_library_indentifier_t{};
rocprofiler_register_error_code_t rocp_reg_status =
rocprofiler_register_library_api_table(
"rccl", &ROCPROFILER_REGISTER_IMPORT_FUNC(rccl), ROCP_REG_VERSION,
table_array.data(), table_array.size(), &lib_id);
INFO(NCCL_COLL,
"[rocprofiler-sdk-rccl][ = %d ] rocprofiler-register returned code = %d : %s",
getpid(), rocp_reg_status, rocprofiler_register_error_string(rocp_reg_status));
if(rocp_reg_status != ROCP_REG_SUCCESS && rocp_reg_status != ROCP_REG_NO_TOOLS)
WARN("[rocprofiler-sdk-rccl][%d] rocprofiler-register failed with error code %d "
": %s",
getpid(), rocp_reg_status,
rocprofiler_register_error_string(rocp_reg_status));
#endif
return tbl;
}
} // end of namespace
// Returns the API function table used by the public entry points.
// The table is obtained from RcclGetFunctionTable_impl() the first time this
// function runs and the resulting pointer is cached in a function-local static,
// so every later call returns that same pointer.
const rcclApiFuncTable*
RcclGetFunctionTable()
{
    static const rcclApiFuncTable* const cachedTable = RcclGetFunctionTable_impl();
    return cachedTable;
}
} // end of namespace rccl
// Declarations of the public API entry points, one NCCL_API invocation per
// function (the NCCL_API macro itself is defined outside this view). Each name
// declared here is defined later in this file as a thin wrapper that forwards to
// the matching *_fn slot of the table returned by ::rccl::RcclGetFunctionTable().
// NOTE(review): stream parameters are spelled both cudaStream_t and hipStream_t,
// and communicators both ncclComm_t and ncclComm*, sometimes differently between
// a declaration here and its definition below (e.g. ncclAllReduce). Presumably
// these are aliases of the same types -- confirm before normalizing the spelling.
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff,
size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
NCCL_API(ncclResult_t, ncclAllToAllv, const void* sendbuff, const size_t sendcounts[],
const size_t sdispls[], void* recvbuff, const size_t recvcounts[],
const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm,
hipStream_t stream);
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
hipStream_t stream);
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count,
ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream);
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, hipStream_t stream);
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t* op, void* scalar,
ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm);
NCCL_API(ncclResult_t, ncclGroupStart);
NCCL_API(ncclResult_t, ncclGroupEnd);
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks,
ncclUniqueId commId, int myrank);
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks,
ncclUniqueId commId, int myrank, ncclConfig_t* config);
NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm);
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key,
ncclComm_t* newcomm, ncclConfig_t* config);
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t* asyncError);
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
NCCL_API(ncclResult_t, ncclMemAlloc, void** ptr, size_t size);
NCCL_API(ncclResult_t, ncclMemFree, void* ptr);
NCCL_API(ncclResult_t, mscclLoadAlgo, const char* mscclAlgoFilePath,
mscclAlgoHandle_t* mscclAlgoHandle, int rank);
NCCL_API(ncclResult_t, mscclRunAlgo, const void* sendBuff, const size_t sendCounts[],
const size_t sDisPls[], void* recvBuff, const size_t recvCounts[],
const size_t rDisPls[], size_t count, ncclDataType_t dataType, int root,
int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm,
hipStream_t stream);
NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle);
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size,
void** handle);
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclAllGather_fn slot, returning that call's result.
ncclResult_t
ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
              ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclAllGather_fn(sendbuff, recvbuff, sendcount, datatype, comm,
                                      stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclAllReduce_fn slot, returning that call's result.
ncclResult_t
ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
              ncclRedOp_t op, ncclComm* comm, cudaStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclAllReduce_fn(sendbuff, recvbuff, count, datatype, op, comm,
                                      stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclAllToAll_fn slot, returning that call's result.
ncclResult_t
ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
             ncclComm_t comm, hipStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclAllToAll_fn(sendbuff, recvbuff, count, datatype, comm, stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclAllToAllv_fn slot, returning that call's result.
ncclResult_t
ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
              void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
              ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclAllToAllv_fn(sendbuff, sendcounts, sdispls, recvbuff,
                                      recvcounts, rdispls, datatype, comm, stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclBroadcast_fn slot, returning that call's result.
ncclResult_t
ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
              int root, ncclComm_t comm, cudaStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclBroadcast_fn(sendbuff, recvbuff, count, datatype, root, comm,
                                      stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclGather_fn slot, returning that call's result.
ncclResult_t
ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
           ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclGather_fn(sendbuff, recvbuff, sendcount, datatype, root, comm,
                                   stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclReduce_fn slot, returning that call's result.
ncclResult_t
ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
           ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclReduce_fn(sendbuff, recvbuff, count, datatype, op, root, comm,
                                   stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclReduceScatter_fn slot, returning that call's
// result.
ncclResult_t
ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
                  ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
                  cudaStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclReduceScatter_fn(sendbuff, recvbuff, recvcount, datatype, op,
                                          comm, stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclScatter_fn slot, returning that call's result.
ncclResult_t
ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
            ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclScatter_fn(sendbuff, recvbuff, recvcount, datatype, root, comm,
                                    stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclSend_fn slot, returning that call's result.
ncclResult_t
ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
         ncclComm_t comm, cudaStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclSend_fn(sendbuff, count, datatype, peer, comm, stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclRecv_fn slot, returning that call's result.
ncclResult_t
ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm,
         cudaStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclRecv_fn(recvbuff, count, datatype, peer, comm, stream);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclRedOpCreatePreMulSum_fn slot, returning that
// call's result.
ncclResult_t
ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype,
                         ncclScalarResidence_t residence, ncclComm_t comm)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclRedOpCreatePreMulSum_fn(op, scalar, datatype, residence, comm);
}
// Public entry point: fetches the API function table and forwards both
// arguments unchanged to its ncclRedOpDestroy_fn slot, returning that call's
// result.
ncclResult_t
ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclRedOpDestroy_fn(op, comm);
}
// Public entry point: fetches the API function table and invokes its
// ncclGroupStart_fn slot, returning that call's result.
ncclResult_t
ncclGroupStart()
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclGroupStart_fn();
}
// Public entry point: fetches the API function table and invokes its
// ncclGroupEnd_fn slot, returning that call's result.
ncclResult_t
ncclGroupEnd()
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclGroupEnd_fn();
}
// Public entry point: fetches the API function table and forwards the argument
// unchanged to its ncclGetVersion_fn slot, returning that call's result.
ncclResult_t
ncclGetVersion(int* version)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclGetVersion_fn(version);
}
// Public entry point: fetches the API function table and forwards the argument
// unchanged to its ncclGetUniqueId_fn slot, returning that call's result.
ncclResult_t
ncclGetUniqueId(ncclUniqueId* out)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclGetUniqueId_fn(out);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclCommInitRank_fn slot, returning that call's
// result.
ncclResult_t
ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommInitRank_fn(newcomm, nranks, commId, myrank);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclCommInitAll_fn slot, returning that call's
// result.
ncclResult_t
ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommInitAll_fn(comms, ndev, devlist);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclCommInitRankConfig_fn slot, returning that
// call's result.
ncclResult_t
ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank,
                       ncclConfig_t* config)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommInitRankConfig_fn(comm, nranks, commId, myrank, config);
}
// Public entry point: fetches the API function table and forwards the argument
// unchanged to its ncclCommFinalize_fn slot, returning that call's result.
ncclResult_t
ncclCommFinalize(ncclComm_t comm)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommFinalize_fn(comm);
}
// Public entry point: fetches the API function table and forwards the argument
// unchanged to its ncclCommDestroy_fn slot, returning that call's result.
ncclResult_t
ncclCommDestroy(ncclComm_t comm)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommDestroy_fn(comm);
}
// Public entry point: fetches the API function table and forwards the argument
// unchanged to its ncclCommAbort_fn slot, returning that call's result.
ncclResult_t
ncclCommAbort(ncclComm_t comm)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommAbort_fn(comm);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclCommSplit_fn slot, returning that call's result.
ncclResult_t
ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm,
              ncclConfig_t* config)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommSplit_fn(comm, color, key, newcomm, config);
}
// Public entry point: fetches the API function table and forwards the result
// code unchanged to its ncclGetErrorString_fn slot, returning the string that
// call yields.
const char*
ncclGetErrorString(ncclResult_t code)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclGetErrorString_fn(code);
}
// Public entry point: fetches the API function table and forwards the
// communicator unchanged to its ncclGetLastError_fn slot, returning the string
// that call yields.
const char*
ncclGetLastError(const ncclComm_t comm)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclGetLastError_fn(comm);
}
// Public entry point: fetches the API function table and forwards both
// arguments unchanged to its ncclCommGetAsyncError_fn slot, returning that
// call's result.
ncclResult_t
ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommGetAsyncError_fn(comm, asyncError);
}
// Public entry point: fetches the API function table and forwards both
// arguments unchanged to its ncclCommCount_fn slot, returning that call's result.
ncclResult_t
ncclCommCount(const ncclComm_t comm, int* count)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommCount_fn(comm, count);
}
// Public entry point: fetches the API function table and forwards both
// arguments unchanged to its ncclCommCuDevice_fn slot, returning that call's
// result.
ncclResult_t
ncclCommCuDevice(const ncclComm_t comm, int* devid)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommCuDevice_fn(comm, devid);
}
// Public entry point: fetches the API function table and forwards both
// arguments unchanged to its ncclCommUserRank_fn slot, returning that call's
// result.
ncclResult_t
ncclCommUserRank(const ncclComm_t comm, int* rank)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommUserRank_fn(comm, rank);
}
// Public entry point: fetches the API function table and forwards both
// arguments unchanged to its ncclMemAlloc_fn slot, returning that call's result.
ncclResult_t
ncclMemAlloc(void** ptr, size_t size)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclMemAlloc_fn(ptr, size);
}
// Public entry point: fetches the API function table and forwards the pointer
// unchanged to its ncclMemFree_fn slot, returning that call's result.
ncclResult_t
ncclMemFree(void* ptr)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclMemFree_fn(ptr);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its mscclLoadAlgo_fn slot, returning that call's result.
ncclResult_t
mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->mscclLoadAlgo_fn(mscclAlgoFilePath, mscclAlgoHandle, rank);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged, in the same order, to its mscclRunAlgo_fn slot, returning
// that call's result.
ncclResult_t
mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
             void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
             size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
             mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->mscclRunAlgo_fn(sendBuff, sendCounts, sDisPls, recvBuff,
                                     recvCounts, rDisPls, count, dataType, root, peer,
                                     op, mscclAlgoHandle, comm, stream);
}
// Public entry point: fetches the API function table and forwards the handle
// unchanged to its mscclUnloadAlgo_fn slot, returning that call's result.
ncclResult_t
mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->mscclUnloadAlgo_fn(mscclAlgoHandle);
}
// Public entry point: fetches the API function table and forwards every
// argument unchanged to its ncclCommRegister_fn slot, returning that call's
// result.
ncclResult_t
ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommRegister_fn(comm, buff, size, handle);
}
// Public entry point: fetches the API function table and forwards both
// arguments unchanged to its ncclCommDeregister_fn slot, returning that call's
// result.
ncclResult_t
ncclCommDeregister(const ncclComm_t comm, void* handle)
{
    const auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommDeregister_fn(comm, handle);
}
+4 -3
Wyświetl plik
@@ -7,11 +7,12 @@
#include "msccl/msccl_parser.h"
#include "msccl/msccl_setup.h"
#include "msccl/msccl_status.h"
#include "api_trace.h"
#include <cstdio>
#include <cstdlib>
NCCL_API(ncclResult_t, mscclLoadAlgo, const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) {
ncclResult_t mscclLoadAlgo_impl(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) {
mscclStatus& status = mscclGetStatus(rank);
if (status.freeAlgoHandles.size() == 0) {
@@ -39,7 +40,7 @@ NCCL_API(ncclResult_t, mscclRunAlgo,
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
ncclResult_t mscclRunAlgo(
ncclResult_t mscclRunAlgo_impl(
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
@@ -76,7 +77,7 @@ ncclResult_t mscclRunAlgo(
}
NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle);
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle) {
ncclResult_t mscclUnloadAlgo_impl(mscclAlgoHandle_t mscclAlgoHandle) {
// deprecated
return ncclSuccess;
}
+3 -2
Wyświetl plik
@@ -9,6 +9,7 @@
#include "comm.h"
#include "net.h"
#include "register.h"
#include "api_trace.h"
ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
struct ncclRegCache* cache = &comm->regCache;
@@ -151,7 +152,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
}
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle) {
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
NCCLCHECK(ncclRegister(comm, buff, size, handle));
@@ -159,7 +160,7 @@ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, vo
}
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
struct ncclReg* reg = (struct ncclReg*)handle;
struct ncclRegCache* cache = &comm->regCache;
+1 -1
Wyświetl plik
@@ -5,7 +5,7 @@
************************************************************************/
#include "PtrUnion.hpp"
#include "api_trace.h"
namespace RcclUnitTesting
{
size_t DataTypeToBytes(ncclDataType_t const dataType)