Merge remote-tracking branch 'nccl/master' into HEAD

[ROCm/rccl commit: e1cb45ff22]
Этот коммит содержится в:
Wenkai Du
2023-02-04 01:43:38 +00:00
родитель f7982e9bed 2840ac0139
Коммит c76bc214c8
62 изменённых файлов: 3799 добавлений и 1447 удалений
+7
Просмотреть файл
@@ -4,6 +4,13 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
## Unreleased
### Changed
- Compatibility with NCCL 2.16.2
### Added
### Fixed
### Removed
## Unreleased - RCCL 2.15.5 for ROCm 5.5.0
### Changed
- Compatibility with NCCL 2.15.5
- Unit test executable renamed to rccl-UnitTests
### Added
+11 -1
Просмотреть файл
@@ -205,12 +205,15 @@ set(HEADER_SOURCES
src/include/param.h
src/include/channel.h
src/include/nvtx_stub.h
src/include/nvtx3.hpp
src/include/core.h
src/include/info.h
src/include/git_version.h
src/include/npkit/npkit_event.h
src/include/npkit/npkit.h
src/include/npkit/npkit_struct.h
src/include/nvtx3/nvToolsExtPayload.h
src/include/nvtx3/nvToolsExt.h
src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
src/include/nvtx3/nvtxDetail/nvtxTypes.h
src/include/nvtx3/nvtxDetail/nvtxImpl.h
@@ -223,6 +226,11 @@ set(HEADER_SOURCES
src/include/nvtx3/nvtxDetail/nvtxInit.h
src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
src/include/nvtx3/nvToolsExtSync.h
src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h
src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h
src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h
src/include/nvtx3/nvToolsExtCudaRt.h
src/include/nvtx3/nvToolsExtCuda.h
src/include/nvtx3/nvToolsExtOpenCL.h
@@ -372,6 +380,8 @@ configure_file(src/nccl.h.in ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
# Execute git_version_check whenever rccl library is built
add_dependencies(rccl git_version_check)
add_definitions(-DNVTX_NO_IMPL)
if(TRACE)
add_definitions(-DENABLE_TRACE)
endif()
@@ -537,7 +547,7 @@ Source: https://github.com/ROCmSoftwarePlatform/rccl
Files: *
Copyright: (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
License: See LICENSE.txt for license information\n")
install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
# Write changelog file
-1
Просмотреть файл
@@ -5,7 +5,6 @@
#ifndef NCCL_NET_V3_H_
#define NCCL_NET_V3_H_
#define NCCL_NET_HANDLE_MAXSIZE_V3 64
#define NCCL_NET_MAX_REQUESTS_V3 16
typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
+2
Просмотреть файл
@@ -5,6 +5,8 @@
#ifndef NCCL_NET_V4_H_
#define NCCL_NET_V4_H_
#define NCCL_NET_HANDLE_MAXSIZE_V4 64
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
+10 -6
Просмотреть файл
@@ -10,7 +10,7 @@ VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 0
PROFAPI ?= 1
NVTX ?= 1
NVCC = $(CUDA_HOME)/bin/nvcc
@@ -25,22 +25,26 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
# You should define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
# SM35 is deprecated from CUDA12.0 onwards
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
endif
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
CUDA11_8_GENCODE = -gencode=arch=compute_90,code=sm_90
CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
CUDA11_8_PTX = -gencode=arch=compute_90,code=compute_90
CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
# Include Hopper support if we're using CUDA11.8 or above
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_8_GENCODE) $(CUDA11_8_PTX)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
# Include Volta support if we're using CUDA9 or above
+2 -2
Просмотреть файл
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 15
NCCL_PATCH := 5
NCCL_MINOR := 16
NCCL_PATCH := 2
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
Просмотреть файл
@@ -9,7 +9,7 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
+121 -76
Просмотреть файл
@@ -14,6 +14,11 @@
#include "proxy.h"
#include "signals.h" // [RCCL]
struct bootstrapRootArgs {
struct ncclSocket* listenSock;
uint64_t magic;
};
/* Init functions */
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
static union ncclSocketAddress bootstrapNetIfAddr;
@@ -27,7 +32,7 @@ ncclResult_t bootstrapNetInit() {
char* env = getenv("NCCL_COMM_ID");
if (env) {
union ncclSocketAddress remoteAddr;
if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
@@ -90,8 +95,10 @@ static ncclResult_t setFilesLimit() {
return ncclSuccess;
}
static void *bootstrapRoot(void* args) {
struct ncclSocket* listenSock = (struct ncclSocket*)args;
static void *bootstrapRoot(void* rargs) {
struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
struct ncclSocket* listenSock = args->listenSock;
uint64_t magic = args->magic;
ncclResult_t res = ncclSuccess;
int nranks = 0, c = 0;
struct extInfo info;
@@ -105,11 +112,10 @@ static void *bootstrapRoot(void* args) {
/* Receive addresses from all ranks */
do {
struct ncclSocket sock;
/* bootstrap root thread always uses blocking ncclSocketAccept. */
NCCLCHECKGOTO(ncclSocketInit(&sock, NULL, NULL, 0), res, out);
NCCLCHECKGOTO(ncclSocketInit(&sock), res, out);
NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
close(sock.fd);
NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);
if (c == 0) {
nranks = info.nranks;
@@ -140,54 +146,60 @@ static void *bootstrapRoot(void* args) {
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
struct ncclSocket sock;
sock.abortFlag = NULL;
sock.asyncFlag = 0;
memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
NCCLCHECKGOTO(ncclSocketInit(&sock, rankAddressesRoot+r, magic, ncclSocketTypeBootstrap), res, out);
NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
close(sock.fd);
NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);
}
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
out:
close(listenSock->fd);
free(listenSock);
if (listenSock != NULL) {
ncclSocketClose(listenSock);
free(listenSock);
}
if (rankAddresses) free(rankAddresses);
if (rankAddressesRoot) free(rankAddressesRoot);
if (zero) free(zero);
free(rargs);
TRACE(NCCL_INIT, "DONE");
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) {
struct ncclSocket* listenSock;
NCCLCHECK(ncclCalloc(&listenSock, 1));
memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketListen(listenSock));
memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
struct bootstrapRootArgs* args;
pthread_t thread;
pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
NCCLCHECK(ncclCalloc(&listenSock, 1));
NCCLCHECK(ncclSocketInit(listenSock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, NULL, 0));
NCCLCHECK(ncclSocketListen(listenSock));
NCCLCHECK(ncclSocketGetAddr(listenSock, &handle->addr));
NCCLCHECK(ncclCalloc(&args, 1));
args->listenSock = listenSock;
args->magic = handle->magic;
NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0);
ncclSetThreadName(thread, "NCCL BootstrapR");
pthread_detach(thread); // will not be pthread_join()'d
NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d
return ncclSuccess;
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
memset(id, 0, sizeof(ncclUniqueId));
union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id;
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
memset(handle, 0, sizeof(ncclBootstrapHandle));
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
char* env = getenv("NCCL_COMM_ID");
if (env) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
} else {
memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(id, false));
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(handle, false));
}
return ncclSuccess;
@@ -211,20 +223,27 @@ struct bootstrapState {
int rank;
int nranks;
int virtualId;
uint64_t magic;
volatile uint32_t *abortFlag;
};
ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) {
int rank = comm->rank;
int nranks = comm->nRanks;
int virtualId = comm->virtualId;
struct bootstrapState* state;
struct ncclSocket* proxySocket;
ncclSocketAddress nextAddr;
struct ncclSocket sock, listenSockRoot;
struct extInfo info = { 0 };
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
state->nranks = nranks;
state->abortFlag = comm->abortFlag;
state->virtualId = virtualId;
comm->bootstrap = state;
comm->magic = state->magic = handle->magic;
TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);
@@ -232,23 +251,17 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
RegisterSignalHandlers();
// [/RCCL]
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
struct ncclSocket sock, listenSockRoot;
NCCLCHECK(ncclSocketInit(&sock, (union ncclSocketAddress*) id, comm->abortFlag, 0));
NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->abortFlag, 0));
NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->abortFlag, 0));
NCCLCHECK(ncclSocketInit(&state->ringSendSocket, NULL, comm->abortFlag, 0));
NCCLCHECK(ncclSocketInit(&state->ringRecvSocket, NULL, comm->abortFlag, 0));
// Create socket for other ranks to contact me
NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
NCCLCHECK(ncclSocketListen(&state->listenSock));
memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetAddr(&state->listenSock, &info.extAddressListen));
// Create socket for root to contact me
NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
NCCLCHECK(ncclSocketListen(&listenSockRoot));
memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot));
// stagger connection times to avoid an overload of the root
if (nranks > 128) {
@@ -261,32 +274,37 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
}
// send info on my listening socket to root
NCCLCHECK(ncclSocketInit(&sock, &handle->addr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
NCCLCHECK(ncclSocketConnect(&sock));
NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
close(sock.fd);
NCCLCHECK(ncclSocketClose(&sock));
// get info on my "next" rank in the bootstrap ring from root
NCCLCHECK(ncclSocketInit(&sock));
NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress)));
close(sock.fd);
close(listenSockRoot.fd);
NCCLCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union ncclSocketAddress)));
NCCLCHECK(ncclSocketClose(&sock));
NCCLCHECK(ncclSocketClose(&listenSockRoot));
NCCLCHECK(ncclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECK(ncclSocketInit(&state->ringRecvSocket));
NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));
// AllGather all listen handlers
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank));
NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));
// Create the service proxy
NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
struct ncclSocket* proxySocket;
// proxy is aborted through a message; don't set abortFlag
NCCLCHECK(ncclCalloc(&proxySocket, 1));
NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, NULL, 0));
NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
NCCLCHECK(ncclSocketListen(proxySocket));
memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
@@ -322,16 +340,21 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
}
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
ncclResult_t ret = ncclSuccess;
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, 1));
NCCLCHECK(ncclSocketConnect(&sock));
NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
NCCLCHECK(bootstrapNetSend(&sock, data, size));
close(sock.fd);
return ncclSuccess;
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail);
exit:
NCCLCHECK(ncclSocketClose(&sock));
return ret;
fail:
goto exit;
}
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
@@ -390,9 +413,10 @@ ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag,
return ncclSuccess;
}
ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) {
struct unexConn* elem = state->unexpectedConnections;
struct unexConn* prev = NULL;
*found = 0;
while (elem) {
if (elem->peer == peer && elem->tag == tag) {
if (prev == NULL) {
@@ -402,54 +426,75 @@ ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag,
}
memcpy(sock, &elem->sock, sizeof(struct ncclSocket));
free(elem);
*found = 1;
return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
sock->fd = -1;
return ncclSuccess;
}
static void unexpectedFree(struct bootstrapState* state) {
struct unexConn* elem = state->unexpectedConnections;
struct unexConn* prev = NULL;
while (elem) {
prev = elem;
elem = elem->next;
free(prev);
}
return;
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
ncclResult_t ret = ncclSuccess;
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
int newPeer, newTag;
// Search unexpected connections first
NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
if (sock.fd != -1) {
NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
close(sock.fd);
return ncclSuccess;
int found;
NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found));
if (found) {
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
goto exit;
}
// Then look for new connections
NCCLCHECK(ncclSocketInit(&sock, NULL, state->listenSock.abortFlag, 0));
while (1) {
NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
int newPeer, newTag;
NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int)));
NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int)));
NCCLCHECKGOTO(ncclSocketInit(&sock), ret, fail);
NCCLCHECKGOTO(ncclSocketAccept(&sock, &state->listenSock), ret, fail);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail);
if (newPeer == peer && newTag == tag) {
NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
close(sock.fd);
return ncclSuccess;
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
goto exit;
}
// Unexpected connection. Save for later.
NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock));
NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail);
}
exit:
NCCLCHECK(ncclSocketClose(&sock));
return ret;
fail:
goto exit;
}
ncclResult_t bootstrapClose(void* commState) {
struct bootstrapState* state = (struct bootstrapState*)commState;
if (state->unexpectedConnections != NULL) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
unexpectedFree(state);
if (*state->abortFlag == 0) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
}
if (state->listenSock.fd >= 0) close(state->listenSock.fd);
if (state->ringSendSocket.fd >= 0) close(state->ringSendSocket.fd);
if (state->ringRecvSocket.fd >= 0) close(state->ringRecvSocket.fd);
NCCLCHECK(ncclSocketClose(&state->listenSock));
NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
free(state->peerCommAddresses);
free(state);
@@ -460,9 +505,9 @@ ncclResult_t bootstrapClose(void* commState) {
ncclResult_t bootstrapAbort(void* commState) {
struct bootstrapState* state = (struct bootstrapState*)commState;
if (commState == NULL) return ncclSuccess;
if (state->listenSock.fd) close(state->listenSock.fd);
if (state->ringSendSocket.fd) close(state->ringSendSocket.fd);
if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd);
NCCLCHECK(ncclSocketClose(&state->listenSock));
NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
free(state->peerCommAddresses);
free(state->peerProxyAddresses);
free(state);
+6 -1
Просмотреть файл
@@ -14,7 +14,12 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
};
size_t msgsize = sendcount * ncclTypeSize(datatype);
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
+13 -1
Просмотреть файл
@@ -6,6 +6,7 @@
************************************************************************/
#include "enqueue.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
@@ -13,7 +14,18 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct NvtxParamsAllReduce {
size_t bytes;
ncclRedOp_t op;
};
// Just pass the size of one message and not the total bytes sent/received.
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsAllReduce, op)}
};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
+11 -1
Просмотреть файл
@@ -14,7 +14,17 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct NvtxParamsBroadcast {
size_t bytes;
int root;
};
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
+6 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -490,6 +490,7 @@ struct ncclShmemData {
};
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
int channelId;
int aborted;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclDevChannel channel;
alignas(16) struct ncclWork work;
@@ -499,6 +500,8 @@ struct ncclShmemData {
};
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned");
extern __shared__ ncclShmemData ncclShmem;
#ifdef ENABLE_PROFILING
#define __insert_timestamp(line_num) do { \
if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
@@ -569,8 +572,6 @@ static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we
}
}
extern __shared__ ncclShmemData ncclShmem;
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
__forceinline__ __device__ void ncclKernel(
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
@@ -600,6 +601,8 @@ __forceinline__ __device__ void ncclKernel(
}
__synclds(); // publish ncclShmem.channelId
int channelId = ncclShmem.channelId;
/* set abort flag to 0 */
if (tid == 0) ncclShmem.aborted = 0;
if (true) {
void *dst, *src;
+11 -5
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
@@ -93,7 +93,10 @@ private:
inline __device__ bool checkAbort(int &spins) {
spins++;
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
flags |= atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0) ? Aborted : 0;
if (atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0)) {
flags |= Aborted;
ncclShmem.aborted = 1;
}
spins = 0;
}
return flags & Aborted;
@@ -207,6 +210,9 @@ private:
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
subBarrier();
/* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
* to 0 to avoid unnecessary workload. */
size_t workSize = ncclShmem.aborted ? 0 : sliceSize;
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
@@ -229,7 +235,7 @@ private:
(tid, nworkers, nullptr, false,
1, (T const**)ncclShmem.groups[group].srcs,
fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
sliceSize);
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
@@ -266,7 +272,7 @@ private:
(tid, nworkers, ncclShmem.redOpArgs, postOp,
Recv, (T const**)ncclShmem.groups[group].srcs,
Dst, (T**)ncclShmem.groups[group].dsts,
sliceSize);
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
@@ -303,7 +309,7 @@ private:
(tid, nworkers, ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
sliceSize);
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
+14 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
@@ -14,7 +15,19 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct NvtxParamsReduce {
size_t bytes;
int root;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
+12 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
@@ -14,7 +15,17 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct NvtxParamsReduceScatter {
size_t bytes;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
+13 -2
Просмотреть файл
@@ -11,11 +11,21 @@
#include "msccl/msccl_lifecycle.h"
struct NvtxParamsSendRecv {
size_t bytes;
int peer;
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
@@ -37,7 +47,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
+71 -50
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -577,8 +577,7 @@ static ncclResult_t scheduleP2pTasksToPlan(
// Compute how much to split operations
// Natural step size matching buffer steps.
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (comm->nNodes > 1) stepSize = comm->p2pNetChunkSize;
ssize_t stepSize = comm->p2pChunkSize;
// Try to use all channels
int nChannelsMax = comm->p2pnChannelsPerPeer;
int nChannelsMin = nChannelsMax;
@@ -991,69 +990,87 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
#if CUDART_VERSION >= 11080
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", 0);
#define NCCL_CGA_CLUSTER_SIZE_SM90 4
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", -2);
#endif
#if CUDART_VERSION >= 12000
// NCCL uses the "Remote" Mem Sync domain by default
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
#endif
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
struct ncclTasks* tasks = &comm->tasks;
void *fn = plan->kernelFn;
hipStream_t launchStream = tasks->streams->stream;
cudaStream_t launchStream = tasks->streams->stream;
dim3 grid = {(unsigned)plan->channelCount, 1, 1};
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};
if (tasks->numStreams == 1) {
CUDACHECK(hipExtLaunchKernel(plan->kernelFn, grid, block, args, 0, tasks->streams->stream, NULL, comm->doneEvent, 0));
comm->lastStream = tasks->streams->stream;
} else {
#if CUDART_VERSION >= 11080
int driverVersion;
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
return ncclSuccess;
}
unsigned int clusterSize = 0;
clusterSize = ncclParamCGAClusterSize();
if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
static bool warned = false;
if (warned == false) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
warned = true;
#if CUDART_VERSION >= 11080
int driverVersion;
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
if (driverVersion >= 11080) {
int compCap = comm->compCap;
unsigned int clusterSize = (compCap == 90) ? NCCL_CGA_CLUSTER_SIZE_SM90 : 0;
if (ncclParamCGAClusterSize() != -2) {
clusterSize = ncclParamCGAClusterSize();
if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
static bool warned = false;
if (warned == false) {
WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
warned = true;
}
clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
}
if (clusterSize && driverVersion >= 11080) {
cudaLaunchConfig_t launchConfig = {0};
cudaLaunchAttribute launchAttrs[2];
/* Cooperative Group Array (CGA)
* On sm90 and later we have an extra level of hierarchy where we
* can group together several blocks within the Grid, called
* Thread Block Clusters.
* Clusters enable multiple thread blocks running concurrently
* across multiple SMs to synchronize and collaboratively fetch
* and exchange data. A cluster of blocks are guaranteed to be
* concurrently scheduled onto a group of SMs.
* The maximum value is 8 and it must be divisible into the grid dimensions
*/
cudaLaunchConfig_t launchConfig = {0};
cudaLaunchAttribute launchAttrs[3];
int attrs = 0;
/* Cooperative Group Array (CGA)
* On sm90 and later we have an extra level of hierarchy where we
* can group together several blocks within the Grid, called
* Thread Block Clusters.
* Clusters enable multiple thread blocks running concurrently
* across multiple SMs to synchronize and collaboratively fetch
* and exchange data. A cluster of blocks are guaranteed to be
* concurrently scheduled onto a group of SMs.
* The maximum value is 8 and it must be divisible into the grid dimensions
*/
if (clusterSize) {
// Grid dimension must be divisible by clusterSize
if (grid.x % clusterSize) clusterSize = 1;
launchAttrs[0].id = cudaLaunchAttributeClusterDimension;
launchAttrs[0].val.clusterDim = {clusterSize, 1, 1};
launchAttrs[1].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
launchAttrs[1].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
launchConfig.gridDim = grid;
launchConfig.blockDim = block;
launchConfig.attrs = launchAttrs;
launchConfig.numAttrs = sizeof(launchAttrs)/sizeof(launchAttrs[0]);
launchConfig.stream = launchStream;
CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
return ncclSuccess;
launchAttrs[attrs].id = cudaLaunchAttributeClusterDimension;
launchAttrs[attrs++].val.clusterDim = {clusterSize, 1, 1};
launchAttrs[attrs].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
launchAttrs[attrs++].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
}
#if CUDART_VERSION >= 12000
if (compCap >= 90 && driverVersion >= 12000) {
// Set the NCCL Mem Sync domain on CUDA 12.0 and later (sm90)
launchAttrs[attrs].id = cudaLaunchAttributeMemSyncDomain;
launchAttrs[attrs++].val.memSyncDomain = (cudaLaunchMemSyncDomain) ncclParamMemSyncDomain();
}
#endif
// Standard kernel launch
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream));
launchConfig.gridDim = grid;
launchConfig.blockDim = block;
launchConfig.attrs = launchAttrs;
launchConfig.numAttrs = attrs;
launchConfig.stream = launchStream;
CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
return ncclSuccess;
}
#endif
// Standard kernel launch
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream));
return ncclSuccess;
}
@@ -1080,14 +1097,18 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
// back to us for reclaiming via callbackQueue.
ncclIntruQueueConstruct(&comm->planQueue);
cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch
// Create dependency for deviceStream on launchStream.
if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream), result, resume1);
// Create dependency for deviceStream on launchStream. We know that deviceStream
// hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
// so we can say that launchStream subsumes it.
if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
resume1:
// Create dependency for other user streams (skip launch stream).
// Create dependency for other user streams (skip launch stream) on deviceStream.
// Again, the user streams haven't been touched since deviceStream waited on them
// so we can say they are subsumed by deviceStream.
struct ncclCudaStreamList* sl = tasks->streams->next;
tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
while (sl != nullptr && tasks->numStreams != 1) {
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream), result, resume2);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream, /*b_subsumes_a=*/true), result, resume2);
resume2:
sl = sl->next;
}
+13 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -866,3 +866,15 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
*nranks = nvbGpus;
return ncclSuccess;
}
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
int minPath = PATH_DIS;
for (int i=0; i<system->nodes[GPU].count; i++) {
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
for (int j=0; j<system->nodes[GPU].count; j++) {
if (i == j) continue;
minPath = std::min(minPath, paths[j].type);
}
}
return minPath >= PATH_PIX ? 0 : 1;
}
+4 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -78,6 +78,9 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
*bw = AMD_BW;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
}
+2 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,6 +19,7 @@
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define SKL_QPI_BW 9.0
#define AMD_BW 16.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
+9 -3
Просмотреть файл
@@ -10,10 +10,16 @@
#include "nccl.h"
#include "comm.h"
struct ncclBootstrapHandle {
uint64_t magic;
union ncclSocketAddress addr;
};
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm);
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
+5 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -177,9 +177,12 @@ struct ncclComm {
uint64_t* connectSend;
uint64_t* connectRecv;
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int compCap; // compute capability of the GPU
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int WarpSize;
@@ -216,7 +219,7 @@ struct ncclComm {
// Buffer sizes
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pNetChunkSize;
int p2pChunkSize;
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -253,7 +256,6 @@ struct ncclComm {
// Intra-process sync
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks)
int intraRank;
int intraRanks;
uint32_t intraBarrierPhase;
+38 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -159,20 +159,55 @@ typedef struct gdr_mem_desc {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
static gdr_t ncclGdrInit() {
return NULL;
INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation");
return (gdr_t)0x12345678L;
}
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
gdr_info_t info;
size_t mapSize;
gdr_mh_t mh;
char *devMem;
void *gdrMap;
mapSize = sizeof(T)*nelem;
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true));
gdr_mem_desc_t* md;
NCCLCHECK(ncclCalloc(&md, 1));
md->gdrDevMem = devMem;
md->gdrMap = NULL;
md->gdrMapSize = mapSize;
md->gdrOffset = 0;
md->gdrMh.h = 0;
*gdrHandle = md;
*ptr = (T *)(devMem);
if (devPtr) *devPtr = (T *)(devMem);
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
memcpy(dst, src, nelem*sizeof(T));
return ncclSuccess;
}
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
CUDACHECK(hipFree(md->gdrDevMem));
free(md);
return ncclSuccess;
}
#else
+2 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -29,6 +29,7 @@ void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
+71
Просмотреть файл
@@ -9,6 +9,77 @@
#include "nvtx3.hpp"
#if __cpp_constexpr >= 201304L && !defined(NVTX3_RELAXED_CONSTEXPR)
#define NVTX3_RELAXED_CONSTEXPR constexpr
#else
#define NVTX3_RELAXED_CONSTEXPR
#endif
// Define all NCCL-provided static schema IDs here (avoid duplicates).
#define NVTX_SID_CommInitRank 0
#define NVTX_SID_CommInitAll 1
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_AllGather 4
#define NVTX_SID_AllReduce 5
#define NVTX_SID_Broadcast 6
#define NVTX_SID_ReduceScatter 7
#define NVTX_SID_Reduce 8
#define NVTX_SID_Send 9
#define NVTX_SID_Recv 10
// Define static schema ID for the reduction operation.
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
struct nccl_domain{static constexpr char const* name{"NCCL"};};
class payload_schema {
public:
NVTX3_RELAXED_CONSTEXPR explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{
schema_attr.name = schemaName;
schema_attr.entries = entries;
schema_attr.numEntries = numEntries;
schema_attr.schemaId = schemaId;
nvtxPayloadSchemaRegister(nvtx3::domain::get<nccl_domain>(), &schema_attr);
}
payload_schema() = delete;
~payload_schema() = default;
payload_schema(payload_schema const&) = default;
payload_schema& operator=(payload_schema const&) = default;
payload_schema(payload_schema&&) = default;
payload_schema& operator=(payload_schema&&) = default;
private:
nvtxPayloadSchemaAttr_t schema_attr{
NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES |
NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE |
NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
nullptr,
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
nullptr, 0, 0, 0};
};
// Create NVTX push/pop range with parameters
// @param name of the operation (see `NVTX_SID_*`)
// @param N schema name
// @param S schema (entries)
// @param P payload (struct)
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
static const payload_schema schema{S, std::extent<decltype(S)>::value, \
NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
static ::nvtx3::v1::registered_string<nccl_domain> const nvtx3_func_name__{__func__}; \
nvtxPayloadData_t nvtx3_bpl__[] = { \
{NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
::nvtx3::v1::event_attributes nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
::nvtx3::v1::domain_thread_range<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
extern void initNvtxRegisteredEnums();
#endif
+17
Просмотреть файл
@@ -92,6 +92,7 @@
/* clang-format on */
#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExtPayload.h>
#include <memory>
#include <string>
@@ -1732,6 +1733,22 @@ class event_attributes {
attributes_.messageType = m.get_type();
}
/**
* @brief Variadic constructor where the first argument is a binary payload.
*
* Sets the value of the `EventAttribute`s message based on `m` and forwards
* the remaining variadic parameter pack to the next constructor.
*
*/
template <typename... Args>
NVTX3_RELAXED_CONSTEXPR explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept
: event_attributes(args...)
{
attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY;
attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event.
attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl);
}
~event_attributes() = default;
event_attributes(event_attributes const&) = default;
event_attributes& operator=(event_attributes const&) = default;
+15 -15
Просмотреть файл
@@ -8,7 +8,7 @@
#include "nvToolsExt.h"
#include "hip/hip_runtime.h"
#include "cuda.h"
#ifndef NVTOOLSEXT_CUDA_V3
#define NVTOOLSEXT_CUDA_V3
@@ -42,10 +42,10 @@ extern "C" {
*/
typedef enum nvtxResourceCUDAType_t
{
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* hipDevice_t */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* hipCtx_t */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* hipStream_t */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* hipEvent_t */
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
} nvtxResourceCUDAType_t;
@@ -59,8 +59,8 @@ typedef enum nvtxResourceCUDAType_t
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -73,16 +73,16 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t*
*
* \par Example:
* \code
* hipError_t status = hipCtxCreate( &cuContext, 0, cuDevice );
* if ( hipSuccess != status )
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
* if ( CUDA_SUCCESS != status )
* goto Error;
* nvtxNameCuContext(cuContext, "CTX_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -95,8 +95,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t*
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -109,8 +109,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t*
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
+8 -8
Просмотреть файл
@@ -8,8 +8,8 @@
#include "nvToolsExt.h"
#include "hip/hip_runtime.h"
#include "hip/driver_types.h"
#include "cuda.h"
#include "driver_types.h"
#ifndef NVTOOLSEXT_CUDART_V3
#define NVTOOLSEXT_CUDART_V3
@@ -44,8 +44,8 @@ extern "C" {
typedef enum nvtxResourceCUDARTType_t
{
NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* hipStream_t */
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* hipEvent_t */
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
} nvtxResourceCUDARTType_t;
@@ -73,8 +73,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -87,8 +87,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
+776
Просмотреть файл
@@ -0,0 +1,776 @@
/*
* Copyright 2021 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvtx3/nvToolsExt.h"
#ifndef NVTOOLSEXT_PAYLOAD_H
#define NVTOOLSEXT_PAYLOAD_H
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/**
* \brief A compatibility ID value used in initialization to identify version
* differences.
*/
#define NVTX_EXT_COMPATID_PAYLOAD 0x0103
/**
* \brief This module ID identifies the payload extension. It has to be unique
* among the extension modules.
*/
#define NVTX_EXT_MODULEID_PAYLOAD 2
/**
* \brief Additional values for the enum @ref nvtxPayloadType_t
*/
#define NVTX_PAYLOAD_TYPE_BINARY ((int32_t)0xDFBD0009)
/** ---------------------------------------------------------------------------
* Payload schema entry flags.
* ------------------------------------------------------------------------- */
#define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0
/**
* Absolute pointer into a payload (entry) of the same event.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_POINTER (1 << 1)
/**
* Offset from base address of the payload.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_BASE (1 << 2)
/**
* Offset from the end of this payload entry.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_HERE (1 << 3)
/**
* The value is an array with fixed length, set with the field `arrayLength`.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4)
/**
* The value is a zero-/null-terminated array.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4)
/**
* \brief A single or multi-dimensional array of variable length.
*
* The field `arrayLength` contains the index of the schema entry that holds the
* length(s). If the other field points to a scalar entry then this will be the
* 1D array. If the other field points to a FIXED_SIZE array, then the number of
* dimensions is defined with the registration of the scheme. If the other field
* is ZERO_TERMINATED, the array the dimensions can be determined at runtime.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4)
/**
* A tool may not support deep copy and just ignore this flag.
* See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 9)
/**
* The entry specifies the message in a deferred event. The entry type can be
* any string type. The flag is ignored for schemas that are not flagged with
* `NVTX_PAYLOAD_SCHEMA_FLAG_RANGE*` or `NVTX_PAYLOAD_SCHEMA_FLAG_MARK`.
*/
#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10)
/**
* @note The array flags assume that the array is embedded. Otherwise,
* @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some
* combinations may be invalid based on the `NVTX_PAYLOAD_SCHEMA_TYPE_*` this
* entry is enclosed. For instance, variable length embedded arrays are valid
* within @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC but invalid with
* @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC. See `NVTX_PAYLOAD_SCHEMA_TYPE_*` for
* additional details.
*/
/* Helper macro to check if an entry represents an array. */
#define NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY (\
NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE | \
NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \
NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX)
/** ---------------------------------------------------------------------------
* Types of entries in a payload schema.
* ------------------------------------------------------------------------- */
/**
* @note Several of the predefined types contain the size (in bits) in their
* names. For some data types the size (in bytes) is not fixed and may differ
* for different platforms/operating systems/compilers. To provide portability,
* an array of sizes (in bytes) for type 1 to 28 ( @ref
* NVTX_PAYLOAD_ENTRY_TYPE_CHAR to @ref NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE)
* is passed to the NVTX extension initialization function
* @ref InitializeInjectionNvtxExtension via the `extInfo` field of
* @ref nvtxExtModuleInfo_t.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0
/**
* Basic integer types.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR 1
#define NVTX_PAYLOAD_ENTRY_TYPE_UCHAR 2
#define NVTX_PAYLOAD_ENTRY_TYPE_SHORT 3
#define NVTX_PAYLOAD_ENTRY_TYPE_USHORT 4
#define NVTX_PAYLOAD_ENTRY_TYPE_INT 5
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT 6
#define NVTX_PAYLOAD_ENTRY_TYPE_LONG 7
#define NVTX_PAYLOAD_ENTRY_TYPE_ULONG 8
#define NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG 9
#define NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG 10
/**
* Integer types with explicit size.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_INT8 11
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT8 12
#define NVTX_PAYLOAD_ENTRY_TYPE_INT16 13
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT16 14
#define NVTX_PAYLOAD_ENTRY_TYPE_INT32 15
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT32 16
#define NVTX_PAYLOAD_ENTRY_TYPE_INT64 17
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT64 18
/**
* C floating point types
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT 19
#define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE 20
#define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21
/**
* Size type (`size_t`)
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_SIZE 22
/**
* Any address, e.g. `void*`. If the pointer type matters, use the flag @ref
* NVTX_PAYLOAD_ENTRY_FLAG_POINTER and the respective type instead.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS 23
/**
* Special character types.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_WCHAR 24 /* wide character (since C90) */
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 25 /* since C2x and C++20 */
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 26
#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 27
/**
* There is type size and alignment information for all previous types.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE (NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 + 1)
/**
* Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed.
* Typically a tool will display this as hex or binary.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_BYTE 32
/**
* These types do not have standardized equivalents. It is assumed that the
* number at the end corresponds to the bits used to store the value and that
* the alignment corresponds to standardized types of the same size.
* A tool may not support these types.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_INT128 33
#define NVTX_PAYLOAD_ENTRY_TYPE_UINT128 34
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT16 42
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT32 43
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT64 44
#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT128 45
#define NVTX_PAYLOAD_ENTRY_TYPE_BF16 50
#define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52
/**
* These types are normalized numbers stored in integers. UNORMs represent 0.0
* to 1.0 and SNORMs represent -1.0 to 1.0. The number after represents the
* number of integer bits. Alignment is take from equivalent types INT# matching
* to SNORM# and UINT# matching to UNORM#.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM8 61
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM8 62
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM16 63
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM16 64
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM32 65
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM32 66
#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM64 67
#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM64 68
/**
* String types.
*
* If `arrayOrUnionDetail` is greater than `0`, the entry is a fixed-size string
* with the provided length.
*
* `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is ignored for string types. It
* just specifies once more that the entry is a fixed-size string.
*
* Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` indicates a
* zero-terminated string. If `arrayOrUnionDetail` is greater than `0`, a zero-
* terminated array of fixed-size strings is assumed.
*
* Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies the
* entry index of the entry which contains the string length. It is not possible
* to describe a variable length array of strings.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING 75 /* `char*`, system LOCALE */
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8 76
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF16 77
#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78
/**
* @ref nvtxStringHandle_t returned by @ref nvtxDomainRegisterString
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80
/**
* Entry types to be used in deferred events. Data types are as defined by
* NVTXv3 core: category -> uint32_t, color -> uint32_t, color type -> int32_t.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_CATEGORY 90
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLORTYPE 91
#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLOR 92
/**
* This type marks the union selector member (entry index) in schemas used by
* a union with internal internal selector.
* See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100
/**
* Timestamp types occupy the range from 128 to 255
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP64 128 /* data type is uint64_t */
/**
* CPU timestamp sources.
* \todo All 64 bits?
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC 129
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC_NONVIRTUALIZED 130
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME 131
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME_COARSE 132
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC 133
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_RAW 134
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_COARSE 135
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_BOOTTIME 136
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_PROCESS_CPUTIME_ID 137
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_THREAD_CPUTIME_ID 138
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_QPC 160
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFT 161
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFTP 162
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIME 163
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_CLOCK 164
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIMESPEC_GET 165
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_STEADY_CLOCK 166
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_HIGH_RESOLUTION_CLOCK 167
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_SYSTEM_CLOCK 168
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_UTC_CLOCK 169
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_TAI_CLOCK 170
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_GPS_CLOCK 171
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_FILE_CLOCK 172
/**
* \brief GPU timestamp sources.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_GLOBALTIMER 192
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK 193
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK64 194
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_CUPTI 195
/**
* The timestamp was provided by the NVTX handlers timestamp routine.
*/
#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_TOOL_PROVIDED 224
/**
* This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that
* the payload is a blob of memory which other payload entries may point into.
* A tool will not expose this payload directly.
*/
#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022
/**
* This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that
* the payload is a blob which can be shown with an arbitrary data viewer.
*/
#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023
/* Custom (static) schema IDs. */
#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24)
/* Dynamic schema IDs (generated by the tool) start here. */
#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 // 1 << 32
/**
* \brief Size and alignment information for predefined payload entry types.
*
* The struct contains the size and the alignment size in bytes. A respective
* array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX
* client/handler. The type (ID) is used as index into this array.
*/
typedef struct nvtxPayloadEntryTypeInfo_t
{
uint16_t size;
uint16_t align;
} nvtxPayloadEntryTypeInfo_t;
/**
* \brief Entry in a schema.
*
* A payload schema consists of an array of payload schema entries. It is
* registered with @ref nvtxPayloadSchemaRegister. `flag` can be set to `0` for
* simple values, 'type' is the only "required" field. If not set explicitly,
* all other fields are zero-initialized, which means that the entry has no name
* and the offset is determined based on self-alignment rules.
*
* Example schema:
* nvtxPayloadSchemaEntry_t desc[] = {
* {0, NVTX_EXT_PAYLOAD_TYPE_UINT8, "one byte"},
* {0, NVTX_EXT_PAYLOAD_TYPE_INT32, "four bytes"}
* };
*/
typedef struct nvtxPayloadSchemaEntry_t
{
/**
* \brief Flags to augment the basic type.
*
* This field allows additional properties of the payload entry to be
* specified. Valid values are `NVTX_PAYLOAD_ENTRY_FLAG_*`.
*/
uint64_t flags;
/**
* \brief Predefined payload schema entry type or ID of a registered payload
* schema.
*/
uint64_t type;
/**
* \brief Name of the payload entry. (Optional)
*
* Providing a name is useful to give a meaning to the associated value.
*/
const char* name;
/**
* \brief Description of the payload entry. (Optional)
*/
const char* description;
/**
* \brief String or array length or union selector for union types.
*
* If @ref type is a C string type, this defines the length of the string.
*
* If @ref flags specify that the entry is an array, this field defines the
* length of the array. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more
* details.
*
* If @ref type implies that the entry is a union with schema type
* @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION (external selection of the union
* member), this field contains the index (starting with 0) to an entry of
* integer type in the same schema. The associated field contains the
* selected union member.
*
* @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not
* supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can
* be used instead.
*/
uint64_t arrayOrUnionDetail;
/**
* \brief Offset in the binary payload data (in bytes).
*
* This field specifies the byte offset from the base address of the actual
* binary data (blob) to the data of this entry.
*
* This is an optional field, but it is recommended to specify this field to
* avoid issues in the automatic detection of the offset by a tool/handler.
*/
uint64_t offset;
/**
* Semantics are not yet defined.
*/
void* semantics;
/**
* Reserved for future use. Do not use it!
*/
void* reserved;
} nvtxPayloadSchemaEntry_t;
/**
* \brief Binary payload data, size and decoding information.
*
* An array of nvtxPayloadData_t is passed to the NVTX event attribute payload
* member. To attach a single payload the macro @ref NVTX_EXT_PAYLOAD_SET_ATTR
* can be used.
*/
typedef struct nvtxPayloadData_t
{
/**
* The schema ID, which defines the layout of the binary data.
*/
uint64_t schemaId;
/**
* Size of the binary payload (blob) in bytes.
*/
size_t size;
/**
* Pointer to the binary payload data.
*/
const void* payload;
} nvtxPayloadData_t;
/* Helper macros for safe double-cast of pointer to uint64_t value */
#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE
# ifdef __cplusplus
# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p))
# else
#define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p)
# endif
#endif
#define NVTX_PAYLOAD_CONCAT2(a,b) a##b
#define NVTX_PAYLOAD_CONCAT(a,b) NVTX_PAYLOAD_CONCAT2(a,b)
#define NVTX_DATA_VAR NVTX_PAYLOAD_CONCAT(nvtxDFDB,__LINE__)
/**
* \brief Helper macro to attach a single payload to an NVTX event attribute.
*
* @note The NVTX push, start or mark operation must not be in the same or a
* nested scope.
*/
#define NVTX_PAYLOAD_EVTATTR_SET(EVTATTR, SCHEMA_ID, PAYLOAD_ADDR, SIZE) \
nvtxPayloadData_t NVTX_DATA_VAR[] = {{SCHEMA_ID, SIZE, PAYLOAD_ADDR}}; \
(EVTATTR).payload.ullValue = \
NVTX_POINTER_AS_PAYLOAD_ULLVALUE(NVTX_DATA_VAR); \
(EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \
(EVTATTR).reserved0 = 1;
/**
* \brief Helper macro to attach multiple payloads to an NVTX event attribute.
*
* The payload data array (`nvtxPayloadData_t`) is passed as first argument to
* this macro.
*/
#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(EVTATTR, PAYLOADS) \
(EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \
(EVTATTR).reserved0 = sizeof(PAYLOADS)/sizeof(nvtxPayloadData_t); \
(EVTATTR).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(PAYLOADS);
/**
* \brief The payload schema type.
*
* A schema can be either of these types.
*/
enum nvtxPayloadSchemaType
{
NVTX_PAYLOAD_SCHEMA_TYPE_INVALID = 0,
NVTX_PAYLOAD_SCHEMA_TYPE_STATIC = 1,
NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC = 2,
NVTX_PAYLOAD_SCHEMA_TYPE_UNION = 3,
NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR = 4
};
/**
* \brief Flags for static and dynamic schemas.
*/
enum nvtxPayloadSchemaFlags
{
NVTX_PAYLOAD_SCHEMA_FLAG_NONE = 0,
/**
* This flag indicates that a schema and the corresponding payloads can
* contain fields which require a deep copy.
*/
NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY = (1 << 1),
/**
* This flag indicates that a schema and the corresponding payloads can
* be referenced by another payload of the same event.
*/
NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED = (1 << 2),
/**
* The schema describes a deferred event/marker. Such a schema requires one
* timestamp entry and one string entry with the flag
* `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be
* optionally specified with the respective entry types. The deferred event
* can contain a binary payload itself by using a custom schema ID as type
* its schema description. Multiple occurrences of the same event can be
* described by specifying an array timestamps.
*/
NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_EVENT = (1 << 3),
/**
* The schema describes a deferred event/marker. Such a schema requires
* one start timestamp, one end timestamp and one string entry with the flag
* `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be
* optionally specified with the respective entry types. The deferred range
* can contain a binary payload itself by using a custom schema ID as type
* its schema description.
*
* Timestamps can be provided in different ways:
* - A single range has two timestamp entries with the first (smaller entry
* index) being used as the start/push timestamp.
* - If the range schema contains one array of timestamps, the tool assumes
* that the array contains alternating start and end timestamps.
* - If two timestamp arrays are specified the first entry (with the
* smaller entry index) is assumed to contain the start timestamps. Both
* arrays have to be of the same size.
*/
NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_RANGE = (2 << 3)
};
/**
* The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be
* specified via setting the field `fieldMask`.
*/
#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1)
#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2)
#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3)
#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4)
#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5)
#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6)
#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7)
#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8)
/**
* NVTX payload schema attributes.
*/
typedef struct nvtxPayloadSchemaAttr_t
{
/**
* \brief Mask of valid fields in this structure.
*
* The values from `enum nvtxPayloadSchemaAttributes` have to be used.
*/
uint64_t fieldMask;
/**
* \brief Name of the payload schema. (Optional)
*/
const char* name;
/**
* \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD
*
* A value from `enum nvtxPayloadSchemaType` has to be used.
*/
uint64_t type;
/**
* \brief Payload schema flags. (Optional)
*
* Flags defined in `enum nvtxPayloadSchemaFlags` can be used to set
* additional properties of the schema.
*/
uint64_t flags;
/**
* \brief Entries of a payload schema. (Mandatory) \anchor ENTRIES_FIELD
*
* This field is a pointer to an array of schema entries, each describing a
* field in a data structure, e.g. in a C struct or union.
*/
const nvtxPayloadSchemaEntry_t* entries;
/**
* \brief Number of entries in the payload schema. (Mandatory)
*
* Number of entries in the array of payload entries \ref ENTRIES_FIELD.
*/
size_t numEntries;
/**
* \brief The binary payload size in bytes for static payload schemas.
*
* If \ref PAYLOAD_TYPE_FIELD is @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC this
* value is ignored. If this field is not specified for a schema of type
* @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, the size can be automatically
* determined by a tool.
*/
size_t payloadStaticSize;
/**
* \brief The byte alignment for packed structures.
*
* If not specified, this field defaults to `0`, which means that the fields
* in the data structure are not packed and natural alignment rules can be
* applied.
*/
size_t packAlign;
/* Static/custom schema ID must be
>= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
< NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */
uint64_t schemaId;
} nvtxPayloadSchemaAttr_t;
/**
* \brief Register a payload schema.
*
* @param domain NVTX domain handle.
* @param attr NVTX payload schema attributes.
*/
NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister(
nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr);
/**
* \brief Enumeration entry.
*
* Since the value of an enum entry might not be meaningful for the analysis,
* a tool can show the name of enum entry instead.
*
* @note EXPERIMENTAL
*/
typedef struct nvtxPayloadEnum_t
{
/**
* Name of the enum value.
*/
const char* name;
/**
* Value of the enum entry.
*/
uint64_t value;
/**
* Indicates that this entry sets a specific set of bits, which can be used
* to easily define bitsets.
*/
int8_t isFlag;
} nvtxPayloadEnum_t;
/**
* The values are used to set the field `fieldMask` and specify which fields in
* `nvtxPayloadEnumAttr_t` are set.
*/
#define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1)
#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2)
#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3)
#define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4)
#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5)
/**
* NVTX payload enumeration type attributes.
*/
typedef struct nvtxPayloadEnumAttr_t {
/**
* Mask of valid fields in this struct.
* The values from `enum nvtxPayloadSchemaAttributes` have to be used.
*/
uint64_t fieldMask;
/**
* Name of the enum. (Optional)
*/
const char* name;
/**
* Entries of the enum. (Mandatory)
*/
const nvtxPayloadEnum_t* entries;
/**
* Number of entries in the enum. (Mandatory)
*/
size_t numEntries;
/**
* Size of enumeration type in bytes
*/
size_t sizeOfEnum;
/**
* Static/custom schema ID must be
* >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
* < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START
*/
uint64_t schemaId;
} nvtxPayloadEnumAttr_t;
/**
* \brief Register an enumeration type with the payload extension.
*
* @param domain NVTX domain handle
* @param attr NVTX payload enumeration type attributes.
*/
NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain,
const nvtxPayloadEnumAttr_t* attr);
/**
* \brief Callback Ids of API functions in the payload extension.
*
* The NVTX handler can use these values to register a handler function. When
* InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is
* executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be
* registered as follows:
* moduleInfo->segments->slots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] =
* (intptr_t)handlenvtxPayloadRegisterSchema;
*/
typedef enum NvtxExtPayloadCallbackId
{
NVTX3EXT_CBID_nvtxPayloadSchemaRegister = 0,
NVTX3EXT_CBID_nvtxPayloadEnumRegister = 1,
NVTX3EXT_CBID_PAYLOAD_FN_NUM = 2
} NvtxExtPayloadCallbackId;
#ifdef __GNUC__
#pragma GCC visibility push(internal)
#endif
#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot include directly */
#include "nvtxExtDetail/nvtxExtTypes.h"
#undef NVTX_EXT_TYPES_GUARD
#ifndef NVTX_NO_IMPL
#define NVTX_EXT_IMPL_PAYLOAD_GUARD /* Ensure other headers cannot included directly */
#include "nvtxExtDetail/nvtxExtPayloadTypeInfo.h"
#include "nvtxExtDetail/nvtxExtImplPayload_v1.h"
#undef NVTX_EXT_IMPL_PAYLOAD_GUARD
#endif /*NVTX_NO_IMPL*/
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* NVTOOLSEXT_PAYLOAD_H */
+8 -8
Просмотреть файл
@@ -16,10 +16,10 @@ extern "C" {
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(hipStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(hipEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
{
@@ -39,7 +39,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
@@ -48,7 +48,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char*
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
@@ -57,7 +57,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
@@ -66,7 +66,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* nam
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
+16 -16
Просмотреть файл
@@ -15,16 +15,16 @@
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(hipDevice_t device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(hipDevice_t device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(hipCtx_t context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(hipCtx_t context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(hipStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(hipEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
@@ -33,7 +33,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* na
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
@@ -42,7 +42,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t*
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
@@ -51,7 +51,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* nam
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
@@ -60,7 +60,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t*
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
@@ -69,7 +69,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* na
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
@@ -78,7 +78,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t*
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
@@ -87,7 +87,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name)
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
+1 -1
Просмотреть файл
@@ -18,7 +18,7 @@
/* ------ Dependency-free types binary-compatible with real types ------- */
/* In order to avoid having the NVTX core API headers depend on non-NVTX
* headers like hip/hip_runtime.h, NVTX defines binary-compatible types to use for
* headers like cuda.h, NVTX defines binary-compatible types to use for
* safely making the initialization versions of all NVTX functions without
* needing to have definitions for the real types. */
+93
Просмотреть файл
@@ -0,0 +1,93 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
#ifndef NVTX_EXT_IMPL_H
#define NVTX_EXT_IMPL_H
/* ---- Include required platform headers ---- */
#if defined(_WIN32)
#include <Windows.h>
#else
#include <unistd.h>
#if defined(__ANDROID__)
#include <android/api-level.h>
#endif
#if defined(__linux__) || defined(__CYGWIN__)
#include <sched.h>
#endif
#include <limits.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <pthread.h>
#include <stdlib.h>
#include <wchar.h>
#endif
/* ---- Define macros used in this file ---- */
#ifdef NVTX_DEBUG_PRINT
#ifdef __ANDROID__
#include <android/log.h>
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
#else
#include <stdio.h>
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
#endif
#else /* !defined(NVTX_DEBUG_PRINT) */
#define NVTX_ERR(...)
#define NVTX_INFO(...)
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
// #ifdef __GNUC__
// #pragma GCC visibility push(hidden)
// #endif
#define NVTX_EXTENSION_FRESH 0
#define NVTX_EXTENSION_DISABLED 1
#define NVTX_EXTENSION_STARTING 2
#define NVTX_EXTENSION_LOADED 3
NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0;
#define NVTX_EXT_INIT_GUARD
#include "nvtxExtInit.h"
#undef NVTX_EXT_INIT_GUARD
// #ifdef __GNUC__
// #pragma GCC visibility pop
// #endif
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* NVTX_EXT_IMPL_H */
+85
Просмотреть файл
@@ -0,0 +1,85 @@
/*
* Copyright 2021 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
#define NVTX_EXT_IMPL_GUARD
#include "nvtxExtImpl.h"
#undef NVTX_EXT_IMPL_GUARD
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
NAME##_v##VERSION##_mem##COMPATID
#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD)
/*
* Function slots for the binary payload extension. First entry is the module
* state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
*/
NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1]
= {0};
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
{
nvtxExtModuleSegment_t segment = {
0, // unused (only one segment)
NVTX3EXT_CBID_PAYLOAD_FN_NUM,
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1
};
nvtxExtModuleInfo_t module = {
NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD,
1, &segment, // number of segments, segments
NULL, // no export function needed
// bake type sizes and alignment information into program binary
&nvtxExtPayloadTypeInfo
};
NVTX_INFO( "%s\n", __FUNCTION__ );
NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots));
}
#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
typedef ret_val ( * fn_name##_impl_fntype )signature; \
NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \
intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED) { \
if (slot) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} else { \
NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
if (slot != NVTX_EXTENSION_DISABLED && slot) { \
return (*(fn_name##_impl_fntype)slot) arg_names; \
} \
} \
} \
return ((ret_val)(intptr_t)-1); \
}
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr))
NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr))
#undef NVTX_EXT_FN_IMPL
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
+363
Просмотреть файл
@@ -0,0 +1,363 @@
/*
* Copyright 2009-2020 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#ifndef NVTX_EXT_INIT_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ---- Platform-independent helper definitions and functions ---- */
/* Prefer macros over inline functions to reduce symbol resolution at link time */
#if defined(_WIN32)
#define NVTX_PATHCHAR wchar_t
#define NVTX_STR(x) L##x
#define NVTX_GETENV _wgetenv
#define NVTX_BUFSIZE MAX_PATH
#define NVTX_DLLHANDLE HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC GetProcAddress
#define NVTX_DLLCLOSE FreeLibrary
#define NVTX_YIELD() SwitchToThread()
#define NVTX_MEMBAR() MemoryBarrier()
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
#define NVTX_ATOMIC_WRITE_PTR(address, value) InterlockedExchangePointer((volatile PVOID*)address, (PVOID)value)
#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) old = (intptr_t)InterlockedCompareExchangePointer((volatile PVOID*)address, (PVOID)exchange, (PVOID)comparand)
#elif defined(__GNUC__)
#define NVTX_PATHCHAR char
#define NVTX_STR(x) x
#define NVTX_GETENV getenv
#define NVTX_BUFSIZE PATH_MAX
#define NVTX_DLLHANDLE void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC dlsym
#define NVTX_DLLCLOSE dlclose
#define NVTX_YIELD() sched_yield()
#define NVTX_MEMBAR() __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions */
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
#define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
#else
#error The library does not support your configuration!
#endif
/* Define this to 1 for platforms that where pre-injected libraries can be discovered. */
#if defined(_WIN32)
/* TODO */
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#else
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#endif
/* Define this to 1 for platforms that support environment variables */
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
#define NVTX_SUPPORT_ENV_VARS 1
/* Define this to 1 for platforms that support dynamic/shared libraries */
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked,
* and this will override any dynamic injection. Useful for platforms where dynamic
* injection is not available. Since weak symbols not explicitly marked extern are
* guaranteed to be initialized to zero if no definitions are found by the linker, the
* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
/* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal
* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which
* does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic
* injection library. */
__attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr;
#else
#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
#endif
/* This function tries to find or load an NVTX injection library and get the
* address of its InitializeInjectionExtension function. If such a function pointer
* is found, it is called, and passed the address of this NVTX instance's
* nvtxGetExportTable function, so the injection can attach to this instance.
* If the initialization fails for any reason, any dynamic library loaded will
* be freed, and all NVTX implementation functions will be set to no-ops. If
* initialization succeeds, NVTX functions not attached to the tool will be set
* to no-ops. This is implemented as one function instead of several small
* functions to minimize the number of weak symbols the linker must resolve.
* Order of search is:
* - Pre-injected library exporting InitializeInjectionNvtxExtension
* - Loadable library exporting InitializeInjectionNvtxExtension
* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
*/
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
{
const char* const initFuncName = "InitializeInjectionNvtxExtension";
NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
if(out_init_fnptr){
*out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
}
#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
/* Use POSIX global symbol chain to query for init function from any module */
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName);
#endif
#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
/* Try discovering dynamic injection library to load */
if (!init_fnptr)
{
#if NVTX_SUPPORT_ENV_VARS
/* If env var NVTX_INJECTION64_PATH is set, it should contain the path
* to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
? NVTX_STR("NVTX_INJECTION32_PATH")
: NVTX_STR("NVTX_INJECTION64_PATH");
#endif /* NVTX_SUPPORT_ENV_VARS */
NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
/* Refer to this variable explicitly in case all references to it are #if'ed out */
(void)injectionLibraryPathBuf;
#if NVTX_SUPPORT_ENV_VARS
/* Disable the warning for getenv & _wgetenv -- this usage is safe because
* these functions are not called again before using the returned value. */
#if defined(_MSC_VER)
#pragma warning( push )
#pragma warning( disable : 4996 )
#endif
injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
#if defined(_MSC_VER)
#pragma warning( pop )
#endif
#endif
#if defined(__ANDROID__)
if (!injectionLibraryPath)
{
const char *bits = (sizeof(void*) == 4) ? "32" : "64";
char cmdlineBuf[32];
char pkgName[PATH_MAX];
int count;
int pid;
FILE *fp;
size_t bytesRead;
size_t pos;
pid = (int)getpid();
count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
{
NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
fp = fopen(cmdlineBuf, "r");
if (!fp)
{
NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
fclose(fp);
if (bytesRead == 0)
{
NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
pkgName[bytesRead] = 0;
/* String can contain colon as a process separator. In this case the package name is before the colon. */
pos = 0;
while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
{
++pos;
}
pkgName[pos] = 0;
count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
if (count <= 0 || count >= NVTX_BUFSIZE)
{
NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
/* On Android, verify path is accessible due to aggressive file access restrictions. */
/* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
/* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
if (injectionLibraryPathBuf[0] == '/')
{
#if (__ANDROID_API__ < 21)
int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
#else
int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
#endif
if (access_err != 0)
{
NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
return NVTX_ERR_INIT_ACCESS_LIBRARY;
}
}
injectionLibraryPath = injectionLibraryPathBuf;
}
#endif
/* At this point, injectionLibraryPath is specified if a dynamic
* injection library was specified by a tool. */
if (injectionLibraryPath)
{
/* Load the injection library */
injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
if (!injectionLibraryHandle)
{
NVTX_ERR("Failed to load injection library\n");
return NVTX_ERR_INIT_LOAD_LIBRARY;
}
else
{
/* Attempt to get the injection library's entry-point */
init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
if (!init_fnptr)
{
NVTX_DLLCLOSE(injectionLibraryHandle);
NVTX_ERR("Failed to get address of function %s from injection library\n", initFuncName);
return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
}
}
}
}
#endif
#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
if (!init_fnptr)
{
/* Check weakly-defined function pointer. A statically-linked injection can define this as
* a normal symbol and it will take precedence over a dynamic injection. */
if (InitializeInjectionNvtxExtension_fnptr)
{
init_fnptr = InitializeInjectionNvtxExtension_fnptr;
}
}
#endif
if(out_init_fnptr){
*out_init_fnptr = init_fnptr;
}
/* At this point, if init_fnptr is not set, then no tool has specified
* an NVTX injection library -- return non-success result so all NVTX
* API functions will be set to no-ops. */
if (!init_fnptr)
{
return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
}
return NVTX_SUCCESS;
}
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
nvtxExtModuleInfo_t* moduleInfo,
intptr_t* moduleState
)
{
intptr_t old;
NVTX_INFO( "%s\n", __FUNCTION__ );
if( *moduleState == NVTX_EXTENSION_LOADED) {
return;
}
NVTX_ATOMIC_CAS_PTR(
old,
moduleState,
NVTX_EXTENSION_STARTING,
NVTX_EXTENSION_FRESH);
if (old == NVTX_EXTENSION_FRESH)
{
NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr);
int entryPointStatus = 0;
int forceAllToNoops = 0;
/* Load & initialize injection library -- it will assign the function pointers */
if(init_fnptr == 0){
int result = 0;
/* try to load vanilla NVTX first*/
nvtxInitialize(0);
result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr);
/*at this point init_fnptr will be either 0 or a real function*/
if(result == NVTX_SUCCESS) {
NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr;
}
else {
NVTX_ERR("Failed to load injection library\n");
}
}
if(init_fnptr != 0) {
/* Invoke injection library's initialization function. If it returns
* 0 (failure) and a dynamic injection was loaded, unload it. */
entryPointStatus = init_fnptr(moduleInfo);
if (entryPointStatus == 0) {
NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
}
}
/* Clean up any functions that are still uninitialized so that they are skipped.
* Set all to null if injection init function failed as well.
*/
forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0);
for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){
nvtxExtModuleSegment_t* segment = moduleInfo->segments+s;
for(size_t i = 0; i < segment->slotCount; ++i){
if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){
segment->functionSlots[i] = NVTX_EXTENSION_DISABLED;
}
}
}
NVTX_MEMBAR();
/* Signal that initialization has finished, so now the assigned function pointers will be used */
NVTX_ATOMIC_WRITE_PTR(
moduleState,
NVTX_EXTENSION_LOADED);
}
else /* Spin-wait until initialization has finished */
{
NVTX_MEMBAR();
while (*moduleState != NVTX_EXTENSION_LOADED)
{
NVTX_YIELD();
NVTX_MEMBAR();
}
}
}
#ifdef __cplusplus
}
#endif /* __cplusplus */
+128
Просмотреть файл
@@ -0,0 +1,128 @@
#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
#endif
/*
* Helper array to get the alignment for each predefined C language type.
*/
typedef void* pointer_type;
#if __STDC_VERSION__ >= 201112L /* or CPP11 */
#include <stdalign.h>
#define nvtx_alignof(type) alignof(type)
#define nvtx_alignof2(type,tname) alignof(type)
#else /* __STDC_VERSION__ >= 201112L */
#ifndef __cplusplus
#include <stddef.h>
#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
#define nvtx_alignof2(type,tname) nvtx_alignof(type)
#else /* __cplusplus */
#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
MKTYPEDEF(char);
MKTYPEDEF2(unsigned char, uchar);
MKTYPEDEF(short);
MKTYPEDEF2(unsigned short, ushort);
MKTYPEDEF(int);
MKTYPEDEF2(unsigned int, uint);
MKTYPEDEF(long);
MKTYPEDEF2(unsigned long, ulong);
MKTYPEDEF2(long long, longlong);
MKTYPEDEF2(unsigned long long, ulonglong);
MKTYPEDEF(int8_t);
MKTYPEDEF(uint8_t);
MKTYPEDEF(int16_t);
MKTYPEDEF(uint16_t);
MKTYPEDEF(int32_t);
MKTYPEDEF(uint32_t);
MKTYPEDEF(int64_t);
MKTYPEDEF(uint64_t);
MKTYPEDEF(float);
MKTYPEDEF(double);
MKTYPEDEF2(long double, longdouble);
MKTYPEDEF(size_t);
MKTYPEDEF(pointer_type);
MKTYPEDEF(wchar_t);
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
MKTYPEDEF(char8_t);
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
MKTYPEDEF(char16_t);
MKTYPEDEF(char32_t);
#endif
#undef MKTYPEDEF
#undef MKTYPEDEF2
#endif /* __cplusplus */
#endif /* __STDC_VERSION__ >= 201112L */
/*
* The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`.
*/
const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
{
/* The first entry contains this array's length and the size of each entry in this array. */
{NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)},
/*** C integer types ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR */ {sizeof(char), nvtx_alignof(char)},
/* NVTX_PAYLOAD_ENTRY_TYPE_UCHAR */ {sizeof(unsigned char), nvtx_alignof2(unsigned char, uchar)},
/* NVTX_PAYLOAD_ENTRY_TYPE_SHORT */ {sizeof(short), nvtx_alignof(short)},
/* NVTX_PAYLOAD_ENTRY_TYPE_USHORT */ {sizeof(unsigned short), nvtx_alignof2(unsigned short, ushort)},
/* NVTX_PAYLOAD_ENTRY_TYPE_INT */ {sizeof(int), nvtx_alignof(int)},
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT */ {sizeof(unsigned int), nvtx_alignof2(unsigned int, uint)},
/* NVTX_PAYLOAD_ENTRY_TYPE_LONG */ {sizeof(long), nvtx_alignof(long)},
/* NVTX_PAYLOAD_ENTRY_TYPE_ULONG */ {sizeof(unsigned long), nvtx_alignof2(unsigned long, ulong)},
/* NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG */ {sizeof(long long), nvtx_alignof2(long long, longlong)},
/* NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG */ {sizeof(unsigned long long), nvtx_alignof2(unsigned long long,ulonglong)},
/*** Integer types with explicit size ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_INT8 */ {sizeof(int8_t), nvtx_alignof(int8_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT8 */ {sizeof(uint8_t), nvtx_alignof(uint8_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_INT16 */ {sizeof(int16_t), nvtx_alignof(int16_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT16 */ {sizeof(uint16_t), nvtx_alignof(uint16_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_INT32 */ {sizeof(int32_t), nvtx_alignof(int32_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT32 */ {sizeof(uint32_t), nvtx_alignof(uint32_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_INT64 */ {sizeof(int64_t), nvtx_alignof(int64_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT64 */ {sizeof(uint64_t), nvtx_alignof(uint64_t)},
/*** C floating point types ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_FLOAT */ {sizeof(float), nvtx_alignof(float)},
/* NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE */ {sizeof(double), nvtx_alignof(double)},
/* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)},
/* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)},
/*** Special character types ***/
/* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
{sizeof(char8_t), nvtx_alignof(char8_t)},
#else
{0, 0},
#endif
#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
#else
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {0, 0},
/* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {0, 0}
#endif
};
#undef nvtx_alignof
#undef nvtx_alignof2
+44
Просмотреть файл
@@ -0,0 +1,44 @@
/*
* Copyright 2021 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* This header defines types which are used by the internal implementation
* of NVTX and callback subscribers. API clients do not use these types,
* so they are defined here instead of in nvToolsExt.h to clarify they are
* not part of the NVTX client API. */
#ifndef NVTXEXTTYPES_H
#define NVTXEXTTYPES_H
#ifndef NVTX_EXT_TYPES_GUARD
#error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h.
#endif
typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId);
typedef struct nvtxExtModuleSegment_t
{
size_t segmentId;
size_t slotCount;
intptr_t* functionSlots;
} nvtxExtModuleSegment_t;
typedef struct nvtxExtModuleInfo_t
{
uint16_t nvtxVer;
uint16_t structSize;
uint16_t moduleId;
uint16_t compatId;
size_t segmentsCount;
nvtxExtModuleSegment_t* segments;
NvtxExtGetExportFunction_t getExportFunction;
const void* extInfo;
} nvtxExtModuleInfo_t;
typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo);
#endif /* NVTXEXTTYPES_H */
+5
Просмотреть файл
@@ -7,9 +7,14 @@
#ifndef NCCL_NVTX_STUB_H_
#define NCCL_NVTX_STUB_H_
#include <nvtx3/nvToolsExtPayload.h>
struct nccl_domain{static constexpr char const* name{"NCCL"};};
#define NVTX3_FUNC_RANGE_IN(domain)
#define nvtxNameOsThreadA(syscall, thread)
#define NVTX3_FUNC_WITH_PARAMS(ID, S, P)
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11
#endif
+9 -25
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -30,32 +30,16 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
#define RCCL_PARAM_DECLARE(name) \
int64_t rcclParam##name()
#define RCCL_PARAM(name, env, default_value) \
#define RCCL_PARAM(name, env, deftVal) \
pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
int64_t rcclParam##name() { \
static_assert(default_value != -1LL, "default value cannot be -1"); \
static int64_t value = -1LL; \
int64_t localValue; \
pthread_mutex_lock(&rcclParamMutex##name); \
localValue = value; \
char* en = getenv("RCCL_TEST_ENV_VARS"); \
if (value == -1LL || (en && (strcmp(en, "ENABLE") == 0))){ \
value = default_value; \
char* str = getenv("RCCL_" env); \
if (str && strlen(str) > 0) { \
errno = 0; \
int64_t v = strtoll(str, NULL, 0); \
if (errno) { \
INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "RCCL_" env, value); \
} else { \
value = v; \
INFO(NCCL_ALL,"%s set by environment to %lu.", "RCCL_" env, value); \
} \
constexpr int64_t uninitialized = INT64_MIN; \
static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
static int64_t cache = uninitialized; \
if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
ncclLoadParam("RCCL_" env, deftVal, uninitialized, &cache); \
} \
localValue = value; \
} \
pthread_mutex_unlock(&rcclParamMutex##name); \
return localValue; \
}
return cache; \
}
#endif
+14 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
@@ -13,6 +13,7 @@
#include "info.h"
#include "socket.h"
#include <pthread.h>
#include "shm.h"
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
@@ -116,6 +117,7 @@ struct ncclProxyOpsPool {
struct ncclProxyOps {
ncclProxyOpsPool* pool;
ncclShmHandle_t handle;
int count;
int freeOp;
int nextOps;
@@ -155,6 +157,7 @@ struct ncclProxyPool;
struct ncclProxyProgressState {
// Used by main threads to send work to progress thread
struct ncclProxyOpsPool* opsPool;
ncclShmHandle_t handle;
char opsPoolShmSuffix[6];
pthread_t thread;
@@ -174,7 +177,6 @@ struct ncclProxyState {
struct ncclSocket* listenSock;
int stop;
CUcontext cudaCtx;
int safeAbortFlag;
// Used by main thread
union ncclSocketAddress* peerAddresses;
@@ -186,6 +188,15 @@ struct ncclProxyState {
struct ncclProxyProgressState progressState;
};
enum proxyConnectState {
connUninitialized = 0,
connInitialized = 1,
connSharedInitialized = 2,
connSetupDone = 3,
connConnected = 4,
numConnStates = 5
};
struct ncclProxyConnection {
int send, transport, shared;
int localRank;
@@ -194,7 +205,7 @@ struct ncclProxyConnection {
struct ncclProxyArgs *proxyAppend;
struct ncclProxyArgs **proxyAppendPtr;
void* transportResources;
bool initFlag;
proxyConnectState state;
};
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
+5 -3
Просмотреть файл
@@ -9,7 +9,9 @@
#include "nccl.h"
ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create);
ncclResult_t ncclShmUnlink(const char* shmname);
ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize);
typedef void* ncclShmHandle_t;
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
#endif
+38 -11
Просмотреть файл
@@ -21,6 +21,7 @@
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
/* Common socket address storage structure for IPv4/IPv6 */
union ncclSocketAddress {
@@ -30,32 +31,59 @@ union ncclSocketAddress {
};
enum ncclSocketState {
ncclSocketConnecting = 0,
ncclSocketConnected = 1,
ncclSocketError = 2,
ncclSocketStateNum = 3
} ;
ncclSocketStateNone = 0,
ncclSocketStateInitialized = 1,
ncclSocketStateAccepting = 2,
ncclSocketStateAccepted = 3,
ncclSocketStateConnecting = 4,
ncclSocketStateConnectPolling = 5,
ncclSocketStateConnected = 6,
ncclSocketStateReady = 7,
ncclSocketStateClosed = 8,
ncclSocketStateError = 9,
ncclSocketStateNum = 10
};
enum ncclSocketType {
ncclSocketTypeUnknown = 0,
ncclSocketTypeBootstrap = 1,
ncclSocketTypeProxy = 2,
ncclSocketTypeNetSocket = 3,
ncclSocketTypeNetIb = 4
};
struct ncclSocket {
int fd;
int acceptFd;
int timedOutRetries;
int refusedRetries;
union ncclSocketAddress addr;
volatile uint32_t* abortFlag;
int asyncFlag;
enum ncclSocketState state;
int salen;
uint64_t magic;
enum ncclSocketType type;
};
const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
// Initialize a socket
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
ncclResult_t ncclSocketListen(struct ncclSocket* sock);
ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
// Connect to sock->addr. sock->fd is set after a successful call.
ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse = 0);
// Return socket connection state.
ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state);
// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket);
ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running);
// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock);
ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd);
ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1
@@ -65,6 +93,5 @@ ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
/* initialize a socket. */
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
ncclResult_t ncclSocketClose(struct ncclSocket* sock);
#endif
+6 -3
Просмотреть файл
@@ -98,16 +98,19 @@ ncclResult_t ncclStrongStreamLaunchKernel(
);
// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
// implementation to induce few graph dependencies.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
);
// `b` must be capturing within `graph`.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
);
// `a` must be capturing within `graph`.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
);
// Synchrnoization does not need the strong stream to be acquired.
+14
Просмотреть файл
@@ -27,6 +27,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
ncclResult_t getRandomData(void* buffer, size_t bytes);
struct netIf {
char prefix[64];
@@ -48,6 +49,19 @@ inline uint64_t clockNano() {
return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
}
/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
* return -1 */
inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
ncclResult_t ret = ncclSuccess;
if (bytes > 0) {
const size_t one = 1UL;
FILE* fp = fopen("/dev/urandom", "r");
if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError;
if (fp) fclose(fp);
}
return ret;
}
////////////////////////////////////////////////////////////////////////////////
template<typename Int>
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+26
Просмотреть файл
@@ -0,0 +1,26 @@
#include "nccl.h"
#include "nvtx.h"
static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {
{"Sum", ncclSum},
{"Product", ncclProd},
{"Max", ncclMax},
{"Min", ncclMin},
{"Avg", ncclAvg}
};
// Must be called before the first call to any reduction operation.
void initNvtxRegisteredEnums() {
// Register schemas and strings
constexpr const nvtxPayloadEnumAttr_t eAttr {
.fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES |
NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID,
.name = NULL,
.entries = NvtxEnumRedSchema,
.numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,
.sizeOfEnum = sizeof(ncclRedOp_t),
.schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP
};
nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr);
}
+1 -1
Просмотреть файл
@@ -87,7 +87,7 @@ static void initOnceFunc() {
cudaLib = dlopen(path, RTLD_LAZY);
if (cudaLib == NULL) {
WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", ncclCudaPath, ncclCudaPath);
WARN("Failed to find CUDA library (NCCL_CUDA_PATH='%s') : %s", ncclCudaPath ? ncclCudaPath : "", dlerror());
goto error;
}
+133 -60
Просмотреть файл
@@ -15,79 +15,152 @@
#include <stdlib.h>
#include <unistd.h>
// Change functions behavior to match other SYS functions
static int shm_allocate(int fd, const int shmSize) {
int err = posix_fallocate(fd, 0, shmSize);
if (err) { errno = err; return -1; }
return 0;
}
static int shm_map(int fd, const int shmSize, void** ptr) {
*ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
return (*ptr == MAP_FAILED) ? -1 : 0;
struct shmHandleInternal {
int fd;
char* shmPath;
char* shmPtr;
void* devShmPtr;
size_t shmSize;
size_t realShmSize;
int* refcount;
};
static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmSize, char* hptr, void* dptr, bool create, struct shmHandleInternal* handle) {
handle->fd = fd;
handle->shmPtr = hptr;
handle->devShmPtr = dptr;
handle->shmSize = shmSize;
handle->realShmSize = realShmSize;
handle->refcount = (int*)(hptr + shmSize);
if (create) {
int slen = strlen(shmPath);
handle->shmPath = (char*)malloc(slen + 1);
memcpy(handle->shmPath, shmPath, slen + 1);
if (hptr) memset(hptr, 0, shmSize);
} else {
handle->shmPath = NULL;
}
return;
}
static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) {
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) {
int fd = -1;
char* hptr = NULL;
void* dptr = NULL;
ncclResult_t ret = ncclSuccess;
struct shmHandleInternal* tmphandle;
bool create = refcount > 0 ? true : false;
const size_t refSize = sizeof(int); /* extra sizeof(int) bytes for reference count */
const size_t realShmSize = shmSize + refSize;
*handle = *shmPtr = NULL; /* assume shmPtr and handle always set correctly by users. */
EQCHECKGOTO(tmphandle = (struct shmHandleInternal*)malloc(sizeof(struct shmHandleInternal)), NULL, ret, fail);
if (create) {
/* refcount > 0 means the caller tries to allocate a shared memory. This shared memory segment will have
* refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it
* goes down to 0, unlink should be called in order to delete shared memory file. */
if (shmPath[0] == '\0') {
sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
*fd = mkstemp(shmPath);
fd = mkstemp(shmPath);
} else {
SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
}
if (ftruncate(*fd, shmSize) != 0) {
WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize);
return ncclSystemError;
if (ftruncate(fd, realShmSize) != 0) {
WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
ret = ncclSystemError;
goto fail;
}
INFO(NCCL_ALLOC, "Allocated %d bytes of shared memory in %s\n", shmSize, shmPath);
INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath);
} else {
SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
}
*ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
if (*ptr == NULL) {
WARN("Could not map %s\n", shmPath);
return ncclSystemError;
}
close(*fd);
*fd = -1;
if (create) memset(*ptr, 0, shmSize);
return ncclSuccess;
}
ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) {
int fd = -1;
void* ptr = MAP_FAILED;
ncclResult_t res = ncclSuccess;
NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
if (devShmPtr) {
CUDACHECKGOTO(cudaHostRegister(ptr, shmSize, cudaHostRegisterMapped), res, cudaError);
CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
}
*shmPtr = ptr;
return ncclSuccess;
sysError:
WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
cudaError:
if (fd != -1) close(fd);
if (create) shm_unlink(shmPath);
if (ptr != MAP_FAILED) munmap(ptr, shmSize);
*shmPtr = NULL;
return res;
}
hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (hptr == MAP_FAILED) {
WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
ret = ncclSystemError;
goto fail;
}
ncclResult_t ncclShmUnlink(const char* shmPath) {
if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink");
return ncclSuccess;
}
if (create) {
*(int*)(hptr + shmSize) = refcount;
} else {
int remref = __atomic_sub_fetch((int*)(hptr + shmSize), 1, __ATOMIC_RELAXED);
if (remref == 0) {
/* the last peer has completed attachment, it should unlink the shm mem file. */
if (unlink(shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
}
}
ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) {
if (shmPtr) {
if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr));
if (munmap(shmPtr, shmSize) != 0) {
WARN("munmap of shared memory failed");
return ncclSystemError;
if (refcount != -1) {
WARN("attaching memory should only reduce refcount by 1 but %d is passed", refcount);
}
}
return ncclSuccess;
if (devShmPtr) {
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail);
CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
}
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
exit:
*shmPtr = hptr;
if (devShmPtr) *devShmPtr = dptr;
*handle = (ncclShmHandle_t)tmphandle;
return ret;
fail:
WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize);
if (tmphandle) {
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
ncclShmClose((ncclShmHandle_t)tmphandle);
tmphandle = NULL;
}
hptr = NULL;
dptr = NULL;
goto exit;
}
ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
ncclResult_t ret = ncclSuccess;
struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
if (tmphandle) {
if (tmphandle->fd >= 0) {
close(tmphandle->fd);
if (tmphandle->shmPath != NULL && *tmphandle->refcount > 0) {
if (unlink(tmphandle->shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
ret = ncclSystemError;
}
free(tmphandle->shmPath);
}
}
if (tmphandle->shmPtr) {
if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr));
if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) {
WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno));
ret = ncclSystemError;
}
}
free(tmphandle);
}
return ret;
}
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
ncclResult_t ret = ncclSuccess;
struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
if (tmphandle) {
if (tmphandle->shmPath != NULL) {
if (unlink(tmphandle->shmPath) != 0) {
WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
ret = ncclSystemError;
}
free(tmphandle->shmPath);
tmphandle->shmPath = NULL;
}
}
return ret;
}
+431 -177
Просмотреть файл
@@ -20,6 +20,52 @@
static std::vector<std::pair<int, std::unordered_set<std::string>>> clientPortPool;
static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
int bytes = 0;
*closed = 0;
char* data = (char*)ptr;
char line[SOCKET_NAME_MAXLEN+1];
do {
if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
if (op == NCCL_SOCKET_RECV && bytes == 0) {
*closed = 1;
return ncclSuccess;
}
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
return ncclRemoteError;
} else {
bytes = 0;
}
}
(*offset) += bytes;
if (sock->abortFlag && *sock->abortFlag != 0) {
INFO(NCCL_NET, "socketProgressOpt: abort called");
return ncclInternalError;
}
} while (bytes > 0 && (*offset) < size);
return ncclSuccess;
}
static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
int closed;
NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
if (closed) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
return ncclRemoteError;
}
return ncclSuccess;
}
static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
while (*offset < size)
NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
return ncclSuccess;
}
/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
*
* Output: "IPv4/IPv6 address<port>"
@@ -202,7 +248,7 @@ int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAd
return found;
}
ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
WARN("Net : string is null");
return ncclInvalidArgument;
@@ -304,7 +350,7 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
// Try to find interface that is in the same subnet as the IP in comm id
union ncclSocketAddress idAddr;
ncclGetSocketAddrFromString(&idAddr, commId);
ncclSocketGetAddrFromString(&idAddr, commId);
nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
}
}
@@ -318,39 +364,31 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
}
ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
/* IPv4/IPv6 support */
int family = sock->addr.sa.sa_family;
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
int flags;
/* Create socket and bind it to a port */
int fd = socket(family, SOCK_STREAM, 0);
if (fd == -1) {
WARN("Net : Socket creation failed : %s", strerror(errno));
return ncclSystemError;
if (sock == NULL) {
WARN("ncclSocketListen: pass NULL socket");
return ncclInvalidArgument;
}
if (sock->fd == -1) {
WARN("ncclSocketListen: file descriptor is -1");
return ncclInvalidArgument;
}
if (socketToPort(&sock->addr)) {
// Port is forced by env. Make sure we get the port.
int opt = 1;
#if defined(SO_REUSEPORT)
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
#else
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
#endif
}
/* The socket is set non-blocking for OS level, but asyncFlag is used to control
* blocking and non-blocking behavior in user level. */
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
// addr port should be 0 (Any port)
SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind");
/* Get the assigned Port */
socklen_t size = salen;
SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname");
socklen_t size = sock->salen;
SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");
#ifdef ENABLE_TRACE
char line[SOCKET_NAME_MAXLEN+1];
@@ -360,76 +398,226 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
/* Put the socket in listen mode
* NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
*/
SYSCHECK(listen(fd, 16384), "listen");
sock->fd = fd;
SYSCHECK(listen(sock->fd, 16384), "listen");
sock->state = ncclSocketStateReady;
return ncclSuccess;
}
static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
struct pollfd pfd;
int timeout = 1, ret;
socklen_t rlen = sizeof(int);
memset(&pfd, 0, sizeof(struct pollfd));
pfd.fd = fd;
pfd.events = POLLOUT;
SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
if (ret == 0) {
ret = EINPROGRESS;
} else {
/* check socket status */
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
}
if (ret == EINPROGRESS || ret == ECONNREFUSED)
*state = ncclSocketConnecting;
else if (ret == 0)
*state = ncclSocketConnected;
else
*state = ncclSocketError;
return ncclSuccess;
ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) {
if (sock == NULL) {
WARN("ncclSocketGetAddr: pass NULL socket");
return ncclInvalidArgument;
}
if (sock->state != ncclSocketStateReady) return ncclInternalError;
memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress));
return ncclSuccess;
}
ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) {
NCCLCHECK(getFdState(sock->fd, state));
sock->state = *state;
static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
socklen_t socklen = sizeof(union ncclSocketAddress);
sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
if (sock->fd != -1) {
sock->state = ncclSocketStateAccepted;
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno);
return ncclSystemError;
}
return ncclSuccess;
}
static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
uint64_t magic;
enum ncclSocketType type;
int received = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
if (received == 0) return ncclSuccess;
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
if (magic != sock->magic) {
WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
close(sock->fd);
sock->fd = -1;
// Ignore spurious connection and accept again
sock->state = ncclSocketStateAccepting;
return ncclSuccess;
} else {
received = 0;
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
if (type != sock->type) {
WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type);
sock->state = ncclSocketStateError;
close(sock->fd);
sock->fd = -1;
return ncclInternalError;
} else {
sock->state = ncclSocketStateReady;
}
}
return ncclSuccess;
}
static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
/* blocking/non-blocking connect() is determined by asyncFlag. */
int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
if (ret == 0) {
sock->state = ncclSocketStateConnected;
return ncclSuccess;
} else if (errno == EINPROGRESS) {
sock->state = ncclSocketStateConnectPolling;
return ncclSuccess;
} else if (errno == ECONNREFUSED) {
if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
sock->state = ncclSocketStateError;
WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries);
return ncclRemoteError;
}
usleep(SLEEP_INT);
if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
return ncclSuccess;
} else if (errno == ETIMEDOUT) {
if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
sock->state = ncclSocketStateError;
WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries);
return ncclRemoteError;
}
usleep(SLEEP_INT);
return ncclSuccess;
} else {
char line[SOCKET_NAME_MAXLEN+1];
sock->state = ncclSocketStateError;
WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
return ncclSystemError;
}
}
static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
struct pollfd pfd;
int timeout = 1, ret;
socklen_t rlen = sizeof(int);
memset(&pfd, 0, sizeof(struct pollfd));
pfd.fd = sock->fd;
pfd.events = POLLOUT;
SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
if (ret == 0) return ncclSuccess;
/* check socket status */
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
if (ret == 0) {
sock->state = ncclSocketStateConnected;
} else if (ret == ECONNREFUSED) {
if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
sock->state = ncclSocketStateError;
WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries);
return ncclRemoteError;
}
if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
usleep(SLEEP_INT);
sock->state = ncclSocketStateConnecting;
} else if (ret == ETIMEDOUT) {
if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
sock->state = ncclSocketStateError;
WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries);
return ncclRemoteError;
}
usleep(SLEEP_INT);
sock->state = ncclSocketStateConnecting;
} else if (ret != EINPROGRESS) {
sock->state = ncclSocketStateError;
return ncclSystemError;
}
return ncclSuccess;
}
ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
if (sock == NULL) {
WARN("ncclSocketPollConnect: pass NULL socket");
return ncclInvalidArgument;
}
NCCLCHECK(socketPollConnect(sock));
return ncclSuccess;
}
static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
int sent = 0;
NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
if (sent == 0) return ncclSuccess;
NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
sent = 0;
NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
sock->state = ncclSocketStateReady;
return ncclSuccess;
}
static ncclResult_t socketProgressState(struct ncclSocket* sock) {
if (sock->state == ncclSocketStateAccepting) {
NCCLCHECK(socketTryAccept(sock));
}
if (sock->state == ncclSocketStateAccepted) {
NCCLCHECK(socketFinalizeAccept(sock));
}
if (sock->state == ncclSocketStateConnecting) {
NCCLCHECK(socketStartConnect(sock));
}
if (sock->state == ncclSocketStateConnectPolling) {
NCCLCHECK(socketPollConnect(sock));
}
if (sock->state == ncclSocketStateConnected) {
NCCLCHECK(socketFinalizeConnect(sock));
}
return ncclSuccess;
}
ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
if (sock == NULL) {
*running = 0;
return ncclSuccess;
}
if (sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed) {
WARN("ncclSocketReady: unexpected socket state %d", sock->state);
return ncclRemoteError;
}
*running = (sock->state == ncclSocketStateReady) ? 1 : 0;
if (*running == 0) {
NCCLCHECK(socketProgressState(sock));
*running = (sock->state == ncclSocketStateReady) ? 1 : 0;
}
return ncclSuccess;
}
ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
char line[SOCKET_NAME_MAXLEN+1];
/* IPv4/IPv6 support */
int family = sock->addr.sa.sa_family;
if (family != AF_INET && family != AF_INET6) {
WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
const int one = 1;
if (sock == NULL) {
WARN("ncclSocketConnect: pass NULL socket");
return ncclInvalidArgument;
}
if (sock->fd == -1) {
WARN("ncclSocketConnect: file descriptor is -1");
return ncclInvalidArgument;
}
if (sock->state != ncclSocketStateInitialized) {
WARN("ncclSocketConnect: wrong socket state %d", sock->state);
if (sock->state == ncclSocketStateError) return ncclRemoteError;
return ncclInternalError;
}
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
int flags;
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
/* Connect to a hostname / port */
int fd = socket(family, SOCK_STREAM, 0);
if (fd == -1) {
WARN("Net : Socket creation failed : %s", strerror(errno));
return ncclSystemError;
}
const int one = 1;
SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
/* The socket is set non-blocking for OS level, but asyncFlag is used to control
* blocking and non-blocking behavior in user level. */
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
/* const int bufsize = 128*1024;
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
if (portReuse) {
// pre-define ports according to tid, to avoid extra lock for race condition
int family = sock->addr.sa.sa_family;
if (family != AF_INET && family != AF_INET6) {
WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
return ncclInternalError;
}
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); // pre-define ports according to tid, to avoid extra lock for race condition
if (clientPortPool.size() == 0) {
for (int tid = syscall(SYS_gettid), i = 1; i < 5; i++) {
clientPortPool.push_back(std::make_pair(60000 + i * 1000 + tid % 1000, std::unordered_set<std::string>()));
@@ -448,161 +636,227 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
// bind the port in fd for connect system call
if (reused_port != -1) {
int opt = 1;
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
struct sockaddr_in sin;
sin.sin_family = family;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(reused_port);
SYSCHECK(bind(fd, (struct sockaddr *)&sin, salen), "bind_client_port");
SYSCHECK(bind(sock->fd, (struct sockaddr *)&sin, salen), "bind_client_port");
}
}
TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
sock->state = ncclSocketStateConnecting;
do {
NCCLCHECK(socketProgressState(sock));
} while (sock->asyncFlag == 0 &&
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
(sock->state == ncclSocketStateConnecting ||
sock->state == ncclSocketStateConnectPolling ||
sock->state == ncclSocketStateConnected));
int ret;
int timedout_retries = 0;
int refused_retries = 0;
retry:
/* blocking/non-blocking connect() is determined by asyncFlag. */
ret = connect(fd, &sock->addr.sa, salen);
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
if (!sock->asyncFlag) {
/* blocking socket, need retry if connect fails. */
if (errno == EINPROGRESS || errno == EAGAIN || errno == EALREADY ||
(errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
/* check abortFlag as long as we have chance to retry. */
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
usleep(SLEEP_INT);
goto retry;
}
/* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
* However, it can return EISCONN instead of success which indicates connection is built up in
* background already. No need to call connect() again. */
if (ret == 0 || errno == EISCONN) {
sock->fd = fd;
switch (sock->state) {
case ncclSocketStateConnecting:
case ncclSocketStateConnectPolling:
case ncclSocketStateConnected:
case ncclSocketStateReady:
return ncclSuccess;
}
} else {
sock->fd = fd;
return ncclSuccess;
case ncclSocketStateError:
return ncclSystemError;
default:
WARN("ncclSocketConnect: wrong socket state %d", sock->state);
return ncclInternalError;
}
WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
return ncclRemoteError;
}
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
socklen_t socklen = sizeof(union ncclSocketAddress);
struct pollfd pollfd;
int tmpFd = sock->fd = -1;
int pollret;
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) {
ncclResult_t ret = ncclSuccess;
pollfd.fd = listenSocket->fd;
pollfd.events = POLLIN;
retry:
if ((pollret = poll(&pollfd, 1, listenSocket->asyncFlag ? 0 : 100)) < 0) {
return ncclSystemError;
} else {
tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen);
if (listenSock == NULL || sock == NULL) {
WARN("ncclSocketAccept: pass NULL socket");
ret = ncclInvalidArgument;
goto exit;
}
if (listenSock->state != ncclSocketStateReady) {
WARN("ncclSocketAccept: wrong socket state %d", listenSock->state);
if (listenSock->state == ncclSocketStateError)
ret = ncclSystemError;
else
ret = ncclInternalError;
goto exit;
}
if (!listenSocket->asyncFlag) {
/* blocking socket, if tmpFd is still -1, we need to retry */
if (tmpFd == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
if (listenSocket->abortFlag && *listenSocket->abortFlag != 0) return ncclInternalError;
goto retry;
}
EQCHECK(tmpFd, -1);
if (sock->acceptFd == -1) {
memcpy(sock, listenSock, sizeof(struct ncclSocket));
sock->acceptFd = listenSock->fd;
sock->state = ncclSocketStateAccepting;
}
sock->fd = tmpFd;
return ncclSuccess;
do {
NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
} while (sock->asyncFlag == 0 &&
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
(sock->state == ncclSocketStateAccepting ||
sock->state == ncclSocketStateAccepted));
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
switch (sock->state) {
case ncclSocketStateAccepting:
case ncclSocketStateAccepted:
case ncclSocketStateReady:
ret = ncclSuccess;
break;
case ncclSocketStateError:
ret = ncclSystemError;
break;
default:
WARN("ncclSocketAccept: wrong socket state %d", sock->state);
ret = ncclInternalError;
break;
}
exit:
return ret;
}
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) {
if (sock == NULL)
return ncclSuccess;
ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) {
ncclResult_t ret = ncclSuccess;
if (sock == NULL) goto exit;
sock->timedOutRetries = 0;
sock->refusedRetries = 0;
sock->abortFlag = abortFlag;
sock->asyncFlag = asyncFlag;
sock->state = ncclSocketStateInitialized;
sock->magic = magic;
sock->type = type;
sock->fd = -1;
sock->acceptFd = -1;
if (addr) {
/* IPv4/IPv6 support */
int family;
memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
family = sock->addr.sa.sa_family;
if (family != AF_INET && family != AF_INET6) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
ret = ncclInternalError;
goto fail;
}
sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
/* Connect to a hostname / port */
sock->fd = socket(family, SOCK_STREAM, 0);
if (sock->fd == -1) {
WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno));
ret = ncclSystemError;
goto fail;
}
} else {
memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
}
sock->abortFlag = abortFlag;
sock->asyncFlag = asyncFlag;
sock->state = ncclSocketStateNum;
return ncclSuccess;
}
static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
int bytes = 0;
*closed = 0;
char* data = (char*)ptr;
char line[SOCKET_NAME_MAXLEN+1];
do {
if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
if (op == NCCL_SOCKET_RECV && bytes == 0) {
*closed = 1;
return ncclSuccess;
}
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
return ncclRemoteError;
} else {
bytes = 0;
}
}
(*offset) += bytes;
if (sock->abortFlag && *sock->abortFlag != 0) {
INFO(NCCL_NET, "Socket progress: abort called");
return ncclInternalError;
}
} while (bytes > 0 && (*offset) < size);
return ncclSuccess;
/* Set socket as non-blocking if async or if we need to be able to abort */
if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
int flags;
EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail);
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail);
}
exit:
return ret;
fail:
goto exit;
}
ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
int closed;
NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
if (closed) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
return ncclRemoteError;
if (sock == NULL) {
WARN("ncclSocketProgress: pass NULL socket");
return ncclInvalidArgument;
}
NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
return ncclSuccess;
}
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
while (*offset < size)
NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset));
if (sock == NULL) {
WARN("ncclSocketWait: pass NULL socket");
return ncclInvalidArgument;
}
NCCLCHECK(socketWait(op, sock, ptr, size, offset));
return ncclSuccess;
}
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
int offset = 0;
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
if (sock == NULL) {
WARN("ncclSocketSend: pass NULL socket");
return ncclInvalidArgument;
}
if (sock->state != ncclSocketStateReady) {
WARN("ncclSocketSend: socket state (%d) is not ready", sock->state);
return ncclInternalError;
}
NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
return ncclSuccess;
}
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
int offset = 0;
NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
if (sock == NULL) {
WARN("ncclSocketRecv: pass NULL socket");
return ncclInvalidArgument;
}
if (sock->state != ncclSocketStateReady) {
WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
return ncclInternalError;
}
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
return ncclSuccess;
}
// Receive or detect connection closed
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
int offset = 0;
if (sock == NULL) {
WARN("ncclSocketTryRecv: pass NULL socket");
return ncclInvalidArgument;
}
*closed = 0;
while (offset < size) {
NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
if (*closed) return ncclSuccess;
}
return ncclSuccess;
}
ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
if (sock != NULL) {
if (sock->fd >= 0) close(sock->fd);
sock->state = ncclSocketStateClosed;
sock->fd = -1;
}
return ncclSuccess;
}
ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) {
if (sock == NULL) {
WARN("ncclSocketGetFd: pass NULL socket");
return ncclInvalidArgument;
}
if (fd) *fd = sock->fd;
return ncclSuccess;
}
ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) {
if (sock == NULL) {
WARN("ncclSocketGetFd: pass NULL socket");
return ncclInvalidArgument;
}
sock->fd = fd;
return ncclSuccess;
}
+11 -4
Просмотреть файл
@@ -305,7 +305,8 @@ static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bN
}
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b,
bool b_subsumes_a
) {
#if CUDART_VERSION >= 11030
if (graph.graph == nullptr) {
@@ -319,6 +320,7 @@ ncclResult_t ncclStrongStreamWaitStream(
NCCLCHECK(checkGraphId(ag, graph.graphId));
struct ncclStrongStreamGraph* bg = b->graphHead;
NCCLCHECK(checkGraphId(bg, graph.graphId));
if (b_subsumes_a) ag->tipCount = 0;
mergeTips(ag, bg->tipNodes, bg->tipCount);
}
a->serialEventNeedsRecord = true;
@@ -330,7 +332,8 @@ ncclResult_t ncclStrongStreamWaitStream(
}
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b,
bool b_subsumes_a
) {
#if CUDART_VERSION >= 11030
if (graph.graph == nullptr) {
@@ -351,6 +354,7 @@ ncclResult_t ncclStrongStreamWaitStream(
}
struct ncclStrongStreamGraph* ag = a->graphHead;
NCCLCHECK(checkGraphId(ag, graph.graphId));
if (b_subsumes_a) ag->tipCount = 0;
mergeTips(ag, bNodes, bCount);
}
a->serialEventNeedsRecord = true;
@@ -362,7 +366,8 @@ ncclResult_t ncclStrongStreamWaitStream(
}
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b,
bool b_subsumes_a
) {
#if CUDART_VERSION >= 11030
if (graph.graph == nullptr) {
@@ -374,7 +379,9 @@ ncclResult_t ncclStrongStreamWaitStream(
} else {
struct ncclStrongStreamGraph* bg = b->graphHead;
NCCLCHECK(checkGraphId(bg, graph.graphId));
CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, cudaStreamAddCaptureDependencies));
CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount,
b_subsumes_a ? cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies
));
}
#else
CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
+2 -2
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
@@ -189,7 +189,7 @@ const char* pncclGetErrorString(ncclResult_t result);
*/
const char* ncclGetLastError(ncclComm_t comm);
/// @cond include_hidden
const char* pncclGetError(ncclComm_t comm);
const char* pncclGetLastError(ncclComm_t comm);
/// @endcond
/* Checks whether the comm has encountered any asynchronous errors */
+122 -77
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
@@ -442,8 +442,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS;
// If nNodes > 1 and we're using Simple, reduce the stepSize to increase shared buffer utilization
if (info->comm->nNodes > 1 && op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pNetChunkSize;
if (op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pChunkSize;
info->chunkSize = stepSize;
op->root = info->root;
@@ -537,6 +536,8 @@ static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressS
return ncclSuccess;
}
NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) {
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
if (state->opsPool == NULL) return ncclInternalError;
@@ -575,9 +576,16 @@ process_nextops:
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;
uint64_t lastOpCount = 0;
int lastPeer = -1;
int count = 0;
for (int opIndex = state->nextOps; opIndex != -1;) {
struct ncclProxyOp* peerOp = pool->ops+opIndex;
int peer = opIndex / MAX_OPS_PER_PEER;
if ((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer)) count++;
if (count == ncclParamProxyAppendBatchSize()+1) break;
lastOpCount = peerOp->opCount;
lastPeer = peer;
if (peerOp->connection == NULL) return ncclInternalError;
if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next);
NCCLCHECK(ProxyAppend(state, peerOp));
@@ -681,7 +689,7 @@ void* ncclProxyProgress(void *comm_) {
int lastIdle = 0;
struct ncclProxyArgs profArgs; // Only used for profiling purposes
while (state->stop == 0 && *comm->abortFlag == 0) {
while ((state->stop == false || (state->stop == true && state->active)) && *comm->abortFlag == 0) {
int idle = 1;
ncclResult_t ret = progressOps(comm, state, state->active, &idle);
if (ret != ncclSuccess) {
@@ -691,18 +699,17 @@ void* ncclProxyProgress(void *comm_) {
}
if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
if (idle) {
int added = 0;
TIME_START(3);
int added = 0;
TIME_START(3);
if (state->stop == false)
ret = ncclProxyGetPostedOps(comm, &added);
if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
if (ret != ncclSuccess) {
(void) ncclCommSetAsyncError(comm, ret);
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
}
if (added == 0) {
sched_yield(); // No request progressed. Let others run.
}
if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
if (ret != ncclSuccess) {
(void) ncclCommSetAsyncError(comm, ret);
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
}
if (added == 0) {
sched_yield(); // No request progressed. Let others run.
}
lastIdle = idle;
}
@@ -819,7 +826,7 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
for (int i=0; i<max; i++) {
ncclProxyConnection *connection = pool->pools[b]+i;
if (connection->initFlag == true) {
if (connection->state != connUninitialized) {
NCCLCHECK(proxyFree(connection, comm));
}
}
@@ -832,6 +839,10 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
#include "transport.h"
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) {
struct ncclSocket* sock;
int ready;
int type = ncclProxyMsgInit;
// Keep one connection per mlocal rank
proxyConn->connection = NULL;
proxyConn->rank = rank;
@@ -839,17 +850,18 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
for (int r=0; r<comm->localRanks; r++) {
NCCLCHECK(ncclSocketInit(&comm->proxyState.peerSocks[r], NULL, comm->abortFlag, 0));
for (int i = 0; i < comm->localRanks; ++i) {
NCCLCHECK(ncclSocketSetFd(-1, &comm->proxyState.peerSocks[i]));
}
}
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank;
if (sock->fd == -1) {
memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress));
sock = comm->proxyState.peerSocks + proxyConn->localRank;
NCCLCHECK(ncclSocketReady(sock, &ready));
if (!ready) {
NCCLCHECK(ncclSocketInit(sock, comm->proxyState.peerAddresses+rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
NCCLCHECK(ncclSocketConnect(sock));
}
int type = ncclProxyMsgInit;
NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int)));
NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
@@ -862,7 +874,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1));
struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
if (proxyOps->pool == NULL) {
NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0));
NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
}
}
@@ -873,11 +885,12 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank;
if (sock->fd == -1) return ncclInternalError;
ncclResult_t ret;
struct ncclSocket* sock;
ncclResult_t ret = ncclSuccess;
if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
sock = proxyConn->comm->proxyState.peerSocks + proxyConn->localRank;
if (sock == NULL) return ncclInternalError;
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
@@ -887,7 +900,6 @@ ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void*
return ncclSuccess;
error:
WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
sock->fd = -1;
return ret;
}
@@ -897,16 +909,15 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
int size = sizeof(struct ncclProxyOpsPool);
struct ncclProxyOpsPool* pool = NULL;
char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
shmPath[0] = '\0';
NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1));
// Init pool
pool->nextOps = -1;
// The service thread may be launched already but localRanks may not be set yet.
while (comm->localRanks == 0) sched_yield();
char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
shmPath[0] = '\0';
NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, comm->localRanks + 1, &state->handle));
// Init pool
pool->nextOps = -1;
for (int r=0; r<comm->localRanks; r++) {
pool->freeOps[r] = r*MAX_OPS_PER_PEER;
for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
@@ -933,7 +944,7 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
static void proxyOpsFree(struct ncclComm* comm) {
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) {
if (ncclShmClose(state->handle) != ncclSuccess) {
WARN("[Service thread] shm close failed");
}
}
@@ -942,10 +953,8 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
if (state->opsPool == NULL) return ncclSuccess;
char shmPath[] = "/dev/shm/nccl-XXXXXX";
memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1);
if (ncclShmUnlink(shmPath) != ncclSuccess) {
WARN("[Service thread] shm unlink failed");
if (ncclShmUnlink(state->handle) != ncclSuccess) {
WARN("[Service thread] proxy ops shm unlink failed");
}
return ncclSuccess;
}
@@ -970,7 +979,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
}
INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
__atomic_store_n(&connection->initFlag, true, __ATOMIC_RELEASE);
__atomic_store_n(&connection->state, connInitialized, __ATOMIC_RELEASE);
return ncclSuccess;
}
@@ -985,6 +994,7 @@ static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct
int nChannels;
NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
__atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
return ncclSuccess;
}
@@ -996,14 +1006,29 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
} else return ncclInternalError;
if (done) {
if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
if (op->reqBuff) free(op->reqBuff);
if (op->respBuff) free(op->respBuff);
op->reqBuff = NULL;
op->respBuff = NULL;
if (op->type == ncclProxyMsgSetup)
__atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
else if (op->type == ncclProxyMsgConnect)
__atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
/* if setup or connect is done, we should not return any error at this point since
* ncclSocketSend might already send the respBuff to the requester. If we still choose
* to abort and close the connection, it can cause segfault if the requester is using
* the respBuff. */
if (op->respSize) ncclSocketSend(op->connection->sock, op->respBuff, op->respSize);
if (op->reqBuff) {
free(op->reqBuff);
op->reqBuff = NULL;
}
if (op->respBuff) {
free(op->respBuff);
op->respBuff = NULL;
}
op->type = 0;
(*asyncOpCount)--;
} else if (*comm->abortFlag != 0) {
return ncclInternalError;
}
return ncclSuccess;
}
@@ -1047,36 +1072,52 @@ void* ncclProxyService(void* _args) {
struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
ncclSocketInit(&peers[s].sock, NULL, comm->abortFlag, 0);
pollfds[s].fd = -1;
pollfds[s].events = POLLHUP|POLLIN;
}
pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
WARN("[Proxy Service] Get listenSock fd fails\n");
return NULL;
};
pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
int maxnpeers = 0;
int npeers = 0;
int stop = 0;
int asyncOpCount = 0;
while ((stop == 0 || (stop == 1 && npeers > 0)) && *comm->abortFlag == 0) {
while (stop == 0 || (stop == 1 && npeers > 0)) {
/* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
* connections. Need to wait until all other related comms call abort and safely exit
* together, or we could face segmentation fault. */
if (*comm->abortFlag != 0) stop = 1;
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
if (poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500) < 0) {
WARN("[Proxy Service] Poll failed: %s\n", strerror(errno));
int ret;
do {
ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500);
} while (ret < 0 && errno == EINTR);
if (ret < 0) {
WARN("[Proxy Service] Poll failed: %s", strerror(errno));
return NULL;
}
if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
int s = 0;
while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++;
while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++;
if (s == NCCL_MAX_LOCAL_RANKS) {
WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
return NULL;
}
if (maxnpeers < s+1) maxnpeers = s+1;
struct ncclSocket* sock = &peers[s].sock;
if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) {
if (ncclSocketInit(&peers[s].sock) != ncclSuccess) {
WARN("[Service thread] Initialize peers[%d].sock fails\n", s);
return NULL;
}
if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
WARN("[Service thread] Accept failed %s", strerror(errno));
} else {
pollfds[s].fd = sock->fd;
if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
WARN("[Service thread] Get peers[%d].sock fd fails\n", s);
return NULL;
}
npeers++;
peers[s].localRank = -1;
}
@@ -1088,10 +1129,12 @@ void* ncclProxyService(void* _args) {
int closeConn = 0;
int type = 0;
ncclResult_t res = ncclSuccess;
if (pollfds[s].fd == -1) continue;
if (op->type != 0) {
res = proxyProgressAsync(op, comm, &asyncOpCount);
type = op->type;
if (res != ncclSuccess) op->type = 0;
if (res != ncclSuccess) closeConn = 1;
} else if (pollfds[s].revents & POLLIN) {
int closed;
if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
@@ -1125,26 +1168,31 @@ void* ncclProxyService(void* _args) {
closeConn = 1;
}
if (closeConn) {
close(sock->fd);
sock->fd = pollfds[s].fd = -1;
ncclSocketClose(sock);
if (op->reqBuff) {
free(op->reqBuff);
op->reqBuff = NULL;
}
if (op->respBuff) {
free(op->respBuff);
op->respBuff = NULL;
}
op->type = 0;
pollfds[s].fd = -1;
npeers--;
}
}
}
/* wait until main thread flush all NCCL operations. */
while (*comm->abortFlag != 0 && __atomic_load_n(&comm->proxyState.safeAbortFlag, __ATOMIC_ACQUIRE) == 0)
usleep(1000);
// Wait for all operations to complete and stop progress thread before freeing any resource
if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
WARN("[Proxy Service] proxyDestroy failed");
}
for (int s=0; s<maxnpeers; s++) {
if (peers[s].sock.fd != -1) close(peers[s].sock.fd);
ncclSocketClose(&peers[s].sock);
}
ncclProxyFreeConnections(&connectionPool, comm);
close(comm->proxyState.listenSock->fd);
free(comm->proxyState.listenSock);
ncclSocketClose(comm->proxyState.listenSock);
proxyOpsFree(comm);
return NULL;
}
@@ -1169,32 +1217,29 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
if (state->peerAddresses) {
if (*comm->abortFlag == 0) {
struct ncclSocket sock;
sock.abortFlag = NULL;
sock.asyncFlag = 0;
memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketConnect(&sock));
int type = ncclProxyMsgStop;
NCCLCHECK(ncclSocketInit(&sock, comm->proxyState.peerAddresses + comm->rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
NCCLCHECK(ncclSocketConnect(&sock));
NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
close(sock.fd);
} else {
/* when abortFlag is set, all socket related communications are no longer reliable. We need to
* set a flag to let proxy thread exit. */
__atomic_store_n(&state->safeAbortFlag, 1, __ATOMIC_RELEASE);
NCCLCHECK(ncclSocketClose(&sock));
}
free(state->peerAddresses);
}
if (state->peerSocks) {
for (int i=0; i<comm->localRanks; i++) {
if (state->peerSocks[i].fd != -1) {
int fd;
NCCLCHECK(ncclSocketGetFd(state->peerSocks + i, &fd));
if (fd >= 0) {
if (state->proxyOps[i].pool) {
NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool)));
NCCLCHECK(ncclShmClose(state->proxyOps[i].handle));
}
if (state->sharedDevMems[i]) {
CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i]));
}
int type = ncclProxyMsgClose;
if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int)));
close(state->peerSocks[i].fd);
if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks + i, &type, sizeof(int)));
NCCLCHECK(ncclSocketClose(state->peerSocks + i));
}
}
free(state->peerSocks);
+25 -17
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -78,9 +78,12 @@ void dumpData(struct ncclConnect* data, int ndata) {
}
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
ncclResult_t ret = ncclSuccess;
int highestType = TRANSPORT_P2P; // track highest transport type
struct ncclConnect data[2*MAXCHANNELS];
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
@@ -94,7 +97,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
TIME_START(0);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) {
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type;
}
}
@@ -103,7 +106,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnect* sendData = recvData+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) {
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type));
NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type;
}
}
@@ -112,16 +115,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
TIME_START(2);
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
sendData = data;
recvData = data+sendChannels;
}
} else {
if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
}
TIME_STOP(2);
@@ -129,10 +132,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sideStream));
CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sideStream));
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
TIME_STOP(3);
@@ -140,18 +143,23 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sideStream));
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
TIME_STOP(4);
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
}
CUDACHECK(cudaStreamSynchronize(comm->sideStream));
if (highestTransportType != NULL) *highestTransportType = highestType;
TIME_PRINT("P2P Setup/Connect");
return ncclSuccess;
exit:
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
return ret;
fail:
goto exit;
}
extern struct ncclTransport collNetTransport;
+3 -3
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -445,7 +445,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));
resources->gdcSync = cpuPtr;
struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -513,7 +513,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
if (ncclGdrCopy) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));
if (ncclParamGdrCopySyncEnable()) {
resources->gdcSync = cpuPtr;
+59 -49
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
@@ -71,6 +71,7 @@ struct connectMapMem{
char shmPath[PATH_MAX];
cudaIpcMemHandle_t ipc;
};
ncclShmHandle_t handle;
};
struct connectMap {
@@ -247,13 +248,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
}
static ncclResult_t netMapShm(struct connectMapMem* mem) {
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0));
NCCLCHECK(ncclShmUnlink(mem->shmPath));
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->handle));
return ncclSuccess;
}
static ncclResult_t netCreateShm(struct connectMapMem* mem) {
mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1));
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->handle));
return ncclSuccess;
}
@@ -362,7 +362,7 @@ static ncclResult_t sendFree(struct ncclConnector* send) {
struct connectMap* map = (struct connectMap*)(send->transportResources);
if (map) {
if (map->sameProcess == 0) {
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].handle));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
@@ -395,7 +395,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
state->refcount++;
if (state->size == 0) {
state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pNetChunkSize;
state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pChunkSize;
}
if (size) *size = state->size;
@@ -422,7 +422,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
// Use different pools for different channels and also separate send/recv.
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
*offset = comm->p2pNetChunkSize * globalSlot;
*offset = comm->p2pChunkSize * globalSlot;
return ncclSuccess;
}
@@ -547,6 +547,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
connection->proxyAppendPtr = &connection->proxyAppend;
}
if (resources->netSendComm == NULL) {
*done = 0;
return ncclSuccess;
@@ -605,7 +606,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
}
if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));
resources->gdcSync = cpuPtr;
struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -693,6 +694,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
connection->proxyAppendPtr = &connection->proxyAppend;
}
if (resources->netRecvComm == NULL) {
*done = 0;
return ncclSuccess;
@@ -741,7 +743,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
if (ncclGdrCopy && map->sameProcess) {
uint64_t *cpuPtr, *gpuPtr;
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));
if (ncclParamGdrCopySyncEnable()) {
resources->gdcSync = cpuPtr;
@@ -794,67 +796,75 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
if (resources == NULL) { // NVB Preconnect
if (connection->state == connSharedInitialized) { // NVB Preconnect
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
return ncclSuccess;
}
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
if (connection->state == connConnected) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
}
}
}
struct connectMapMem* mems = resources->map.mems;
if (resources->map.sameProcess) {
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
} else {
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size));
}
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
comms->sendRefCount[resources->channelId]--;
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
struct connectMapMem* mems = resources->map.mems;
if (resources->map.sameProcess) {
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
} else {
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].handle));
}
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
comms->sendRefCount[resources->channelId]--;
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
}
} else {
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
}
} else {
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
}
free(resources);
if (connection->state == connSetupDone) free(resources);
return ncclSuccess;
}
static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
if (resources == NULL) { // NVB Preconnect
if (connection->state == connSharedInitialized) { // NVB Preconnect
NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
return ncclSuccess;
}
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
if (connection->state == connConnected) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
}
}
}
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
comms->recvRefCount[resources->channelId]--;
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
struct connectMapMem* mems = resources->map.mems;
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
if (resources->shared) {
NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
comms->recvRefCount[resources->channelId]--;
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
}
} else {
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
}
} else {
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
}
free(resources);
if (connection->state == connSetupDone) free(resources);
return ncclSuccess;
}
+47 -41
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -59,6 +59,7 @@ struct alignas(64) ncclIbDev {
int realPort;
int maxQp;
struct ncclIbMrCache mrCache;
int ar; // ADAPTIVE_ROUTING
};
#define MAX_IB_PORT 15
@@ -82,6 +83,7 @@ NCCL_PARAM(IbSl, "IB_SL", 0);
NCCL_PARAM(IbTc, "IB_TC", 0);
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
NCCL_PARAM(IbSockClientPortReuse, "IB_SOCK_CLIENT_PORT_REUSE", 0);
NCCL_PARAM(IbSockServerPortReuse, "IB_SOCK_SERVER_PORT_REUSE", 0);
@@ -228,6 +230,11 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
// Enable ADAPTIVE_ROUTING by default on IB networks
// But allow it to be overloaded by an env parameter
ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0;
if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting();
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
@@ -320,11 +327,6 @@ failure:
return ncclSystemError;
}
static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
return ncclSuccess;
}
#define NCCL_NET_IB_MAX_RECVS 8
ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
@@ -386,6 +388,7 @@ struct ncclIbCommStage {
struct ncclIbHandle {
union ncclSocketAddress connectAddr; // Filled by the target
uint64_t magic; // random number to help debugging
struct ncclIbCommStage stage; // Used by the other side when connecting
};
@@ -398,7 +401,7 @@ struct ncclIbRequest {
struct ncclIbVerbs* verbs;
int type;
int events;
union ncclSocketAddress *addr;
struct ncclSocket* sock;
int nreqs;
union {
struct {
@@ -449,6 +452,7 @@ struct ncclIbSendComm {
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
struct ibv_mr* fifoMr;
int ar;
};
// The SendFifo needs to be 32-byte aligned and each element needs
// to be a 32-byte multiple, so that an entry does not get split and
@@ -593,8 +597,8 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
memset(handle, 0, sizeof(struct ncclIbHandle));
comm->dev = dev;
comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */
NCCLCHECK(GetSocketAddr(&comm->sock.addr));
handle->magic = NCCL_SOCKET_MAGIC;
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
if (ncclParamIbSockServerPortReuse()) {
// reuse the socket address and fd for listen system call
if (reusedSockfd == -1) {
@@ -608,16 +612,16 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
} else {
NCCLCHECK(ncclSocketListen(&comm->sock));
}
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
*listenComm = comm;
return ncclSuccess;
}
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
enum ncclSocketState conState;
struct ncclIbCommStage* stage = &handle->stage;
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
int ready;
*sendComm = NULL;
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
@@ -628,20 +632,15 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
}
NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1));
NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
stage->comm = comm;
stage->state = ncclIbCommStateConnect;
NCCLCHECK(ncclSocketConnect(&comm->sock, ncclParamIbSockClientPortReuse()));
ib_connect_check:
/* since ncclSocketConnect is async, we must check if connection is complete */
NCCLCHECK(ncclGetSocketState(&comm->sock, &conState));
if (conState == ncclSocketConnecting) {
/* expect user to call again */
return ncclSuccess;
} else if (conState == ncclSocketError) {
return ncclRemoteError;
}
NCCLCHECK(ncclSocketReady(&comm->sock, &ready));
if (!ready) return ncclSuccess;
// IB Setup
struct ibv_context* ctx;
@@ -653,6 +652,7 @@ ib_connect_check:
for (int q=0; q<comm->nqps; q++) {
NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
}
comm->ar = ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING
// Send my QP Info to receiver through the socket. Hope this won't block.
struct ibv_port_attr portAttr;
@@ -704,9 +704,10 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
struct ncclIbCommStage* stage = &lComm->stage;
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
int ready;
*recvComm = NULL;
if (stage->state == ncclIbCommStateAccept) goto ib_accept;
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
if (stage->state == ncclIbCommStateRecv) goto ib_recv;
if (stage->state == ncclIbCommStateSend) goto ib_send;
if (stage->state != ncclIbCommStateStart) {
@@ -717,12 +718,12 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
stage->comm = rComm;
stage->state = ncclIbCommStateAccept;
NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, lComm->sock.abortFlag, 1));
ib_accept:
NCCLCHECK(ncclSocketInit(&rComm->sock));
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
if (rComm->sock.fd == -1)
return ncclSuccess;
ib_accept_check:
NCCLCHECK(ncclSocketReady(&rComm->sock, &ready));
if (!ready) return ncclSuccess;
struct ncclIbQpInfo remQpInfo;
stage->state = ncclIbCommStateRecv;
@@ -825,7 +826,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
if (r->type == NCCL_NET_IB_REQ_UNUSED) {
r->verbs = verbs;
r->events = 1;
r->addr = NULL;
r->sock = NULL;
*req = r;
return ncclSuccess;
}
@@ -1000,8 +1001,8 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
}
struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) {
// When using adaptive routing, send the bulk of the data first as an
if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
// When using ADAPTIVE_ROUTING, send the bulk of the data first as an
// RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
// completion.
lastWr++;
@@ -1067,28 +1068,31 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
// Sanity checks to catch user collective call count/size mismatches
if (size > slots[r].size) {
char line[SOCKET_NAME_MAXLEN+1];
char line[SOCKET_NAME_MAXLEN + 1];
union ncclSocketAddress addr;
ncclSocketGetAddr(&comm->sock, &addr);
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d",
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size);
r, nreqs, tag, ncclSocketToString(&addr, line), size, slots[r].size);
return ncclInvalidUsage;
} // plus any potential programming errors
else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), slots[r].size, slots[r].addr, slots[r].rkey);
char line[SOCKET_NAME_MAXLEN + 1];
union ncclSocketAddress addr;
ncclSocketGetAddr(&comm->sock, &addr);
WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkey);
return ncclInternalError;
}
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
req->type = NCCL_NET_IB_REQ_SEND;
req->addr = &comm->sock.addr;
req->sock = &comm->sock;
req->verbs = &comm->verbs;
req->nreqs = nreqs;
req->send.size = size;
req->send.data = data;
req->send.lkey = mr->lkey;
req->send.offset = 0;
req->addr = &comm->sock.addr;
req->events = comm->nqps;
*request = reqs[r] = req;
@@ -1181,7 +1185,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
req->type = NCCL_NET_IB_REQ_RECV;
req->addr = &comm->sock.addr;
req->sock = &comm->sock;
req->nreqs = n;
for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
@@ -1220,7 +1224,7 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
req->type = NCCL_NET_IB_REQ_FLUSH;
req->addr = &comm->sock.addr;
req->sock = &comm->sock;
struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];
struct ibv_send_wr wr;
@@ -1268,8 +1272,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
struct ibv_wc *wc = wcs+w;
if (wc->status != IBV_WC_SUCCESS) {
char line[SOCKET_NAME_MAXLEN+1];
union ncclSocketAddress addr;
ncclSocketGetAddr(r->sock, &addr);
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
return ncclRemoteError;
}
@@ -1301,7 +1307,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
ncclResult_t ncclIbCloseSend(void* sendComm) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm) {
close(comm->sock.fd);
NCCLCHECK(ncclSocketClose(&comm->sock));
for (int q=0; q<comm->nqps; q++)
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
@@ -1315,7 +1321,7 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
ncclResult_t ncclIbCloseRecv(void* recvComm) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm) {
close(comm->sock.fd);
if (!ncclParamIbSockServerPortReuse() || reusedSockfd != comm->sock.fd) NCCLCHECK(ncclSocketClose(&comm->sock));
for (int q=0; q<comm->nqps; q++)
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
if (comm->gpuFlush.enabled) {
@@ -1332,7 +1338,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
ncclResult_t ncclIbCloseListen(void* listenComm) {
struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
if (comm) {
if (!ncclParamIbSockServerPortReuse() || reusedSockfd != comm->sock.fd) close(comm->sock.fd);
NCCLCHECK(ncclSocketClose(&comm->sock));
free(comm);
}
return ncclSuccess;
+151 -166
Просмотреть файл
@@ -18,16 +18,16 @@
/* Init functions */
static int ncclNetIfs = -1;
struct ncclSocketDev {
struct ncclNetSocketDev {
union ncclSocketAddress addr;
char devName[MAX_IF_NAME_SIZE];
char* pciPath;
};
static struct ncclSocketDev ncclSocketDevs[MAX_IFS];
static struct ncclNetSocketDev ncclNetSocketDevs[MAX_IFS];
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t ncclNetSocketLock = PTHREAD_MUTEX_INITIALIZER;
static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) {
char devicePath[PATH_MAX];
snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName);
// May return NULL if the file doesn't exist.
@@ -35,9 +35,9 @@ static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
return ncclSuccess;
}
ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) {
if (ncclNetIfs == -1) {
pthread_mutex_lock(&ncclSocketLock);
pthread_mutex_lock(&ncclNetSocketLock);
if (ncclNetIfs == -1) {
char names[MAX_IF_NAME_SIZE*MAX_IFS];
union ncclSocketAddress addrs[MAX_IFS];
@@ -52,9 +52,9 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
line[0] = '\0';
addrline[SOCKET_NAME_MAXLEN] = '\0';
for (int i=0; i<ncclNetIfs; i++) {
strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
strcpy(ncclNetSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
memcpy(&ncclNetSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclNetSocketGetPciPath(ncclNetSocketDevs[i].devName, &ncclNetSocketDevs[i].pciPath));
snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
ncclSocketToString(&addrs[i], addrline));
}
@@ -62,17 +62,17 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
}
}
pthread_mutex_unlock(&ncclSocketLock);
pthread_mutex_unlock(&ncclNetSocketLock);
}
return ncclSuccess;
}
ncclResult_t ncclSocketDevices(int* ndev) {
ncclResult_t ncclNetSocketDevices(int* ndev) {
*ndev = ncclNetIfs;
return ncclSuccess;
}
static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
*speed = 0;
char speedPath[PATH_MAX];
sprintf(speedPath, "/sys/class/net/%s/speed", devName);
@@ -91,12 +91,12 @@ static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
return ncclSuccess;
}
ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
props->name = ncclSocketDevs[dev].devName;
props->pciPath = ncclSocketDevs[dev].pciPath;
ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
props->name = ncclNetSocketDevs[dev].devName;
props->pciPath = ncclNetSocketDevs[dev].pciPath;
props->guid = dev;
props->ptrSupport = NCCL_PTR_HOST;
NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
props->latency = 0; // Not set
props->port = 0;
props->maxComms = 65536;
@@ -104,12 +104,6 @@ ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
return ncclSuccess;
}
ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
if (dev >= ncclNetIfs) return ncclInternalError;
memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
return ncclSuccess;
}
/* Communication functions */
#define MAX_SOCKETS 64
@@ -120,29 +114,30 @@ ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
enum ncclSocketCommState {
ncclSocketCommStateStart = 0,
ncclSocketCommStateConnect = 1,
ncclSocketCommStateAccept = 3,
ncclSocketCommStateSend = 4,
ncclSocketCommStateRecv = 5,
enum ncclNetSocketCommState {
ncclNetSocketCommStateStart = 0,
ncclNetSocketCommStateConnect = 1,
ncclNetSocketCommStateAccept = 3,
ncclNetSocketCommStateSend = 4,
ncclNetSocketCommStateRecv = 5,
};
struct ncclSocketCommStage {
enum ncclSocketCommState state;
struct ncclNetSocketCommStage {
enum ncclNetSocketCommState state;
uint8_t iteration;
struct ncclSocket* sock;
struct ncclSocketComm* comm;
struct ncclNetSocketComm* comm;
};
struct ncclSocketHandle {
struct ncclNetSocketHandle {
union ncclSocketAddress connectAddr;
uint64_t magic; // random number to help debugging
int nSocks;
int nThreads;
struct ncclSocketCommStage stage;
struct ncclNetSocketCommStage stage;
};
struct ncclSocketTask {
struct ncclNetSocketTask {
int op;
void* data;
int size;
@@ -152,41 +147,41 @@ struct ncclSocketTask {
ncclResult_t result;
};
struct ncclSocketRequest {
struct ncclNetSocketRequest {
int op;
void* data;
int size;
struct ncclSocket* ctrlSock;
int offset;
int used;
struct ncclSocketComm* comm;
struct ncclSocketTask* tasks[MAX_SOCKETS];
struct ncclNetSocketComm* comm;
struct ncclNetSocketTask* tasks[MAX_SOCKETS];
int nSubs;
};
struct ncclSocketTaskQueue {
struct ncclNetSocketTaskQueue {
int next;
int len;
struct ncclSocketTask* tasks;
struct ncclNetSocketTask* tasks;
};
struct ncclSocketThreadResources {
struct ncclSocketTaskQueue threadTaskQueue;
struct ncclNetSocketThreadResources {
struct ncclNetSocketTaskQueue threadTaskQueue;
int stop;
struct ncclSocketComm* comm;
struct ncclNetSocketComm* comm;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
};
struct ncclSocketListenComm {
struct ncclNetSocketListenComm {
struct ncclSocket sock;
struct ncclSocketCommStage stage;
struct ncclNetSocketCommStage stage;
int nSocks;
int nThreads;
int dev;
};
struct ncclSocketComm {
struct ncclNetSocketComm {
struct ncclSocket ctrlSock;
struct ncclSocket socks[MAX_SOCKETS];
int dev;
@@ -194,15 +189,15 @@ struct ncclSocketComm {
int nSocks;
int nThreads;
int nextSock;
struct ncclSocketRequest requests[MAX_REQUESTS];
struct ncclNetSocketRequest requests[MAX_REQUESTS];
pthread_t helperThread[MAX_THREADS];
struct ncclSocketThreadResources threadResources[MAX_THREADS];
struct ncclNetSocketThreadResources threadResources[MAX_THREADS];
};
void* persistentSocketThread(void *args_) {
struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
struct ncclSocketComm* comm = resource->comm;
struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
struct ncclNetSocketThreadResources* resource = (struct ncclNetSocketThreadResources*)args_;
struct ncclNetSocketComm* comm = resource->comm;
struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue;
int nSocksPerThread = comm->nSocks / comm->nThreads;
while (1) {
int idle = 1;
@@ -212,7 +207,7 @@ void* persistentSocketThread(void *args_) {
do {
repeat = 0;
for (int j=0; j<nSocksPerThread; j++) {
struct ncclSocketTask* r = myQueue->tasks+i+j;
struct ncclNetSocketTask* r = myQueue->tasks+i+j;
if (r != NULL && r->used == 1 && r->offset < r->size) {
r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
if (r->result != ncclSuccess) {
@@ -236,7 +231,7 @@ void* persistentSocketThread(void *args_) {
}
}
ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
int nSocksPerThread = ncclParamSocketNsocksPerThread();
int nThreads = ncclParamSocketNthreads();
if (nThreads > MAX_THREADS) {
@@ -247,7 +242,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
// Auto-detection
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
char vendorPath[PATH_MAX];
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclSocketDevs[dev].devName);
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName);
char* rPath = realpath(vendorPath, NULL);
int fd = open(rPath, O_RDONLY);
free(rPath);
@@ -285,36 +280,20 @@ end:
return ncclSuccess;
}
ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
(*comm)->sock.fd = -1;
return ncclSuccess;
}
ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
(*comm)->ctrlSock.fd = -1;
for (int i=0; i < MAX_SOCKETS; i++) {
(*comm)->socks[i].fd = -1;
}
(*comm)->nextSock = 0;
return ncclSuccess;
}
ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
if (dev < 0) { // data transfer socket is based on specified dev
ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
return ncclInternalError;
}
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
memset(handle, 0, sizeof(struct ncclSocketHandle));
static_assert(sizeof(struct ncclSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
struct ncclSocketListenComm* comm;
NCCLCHECK(ncclSocketNewListenComm(&comm));
NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
comm->sock.asyncFlag = 1;
struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
memset(handle, 0, sizeof(struct ncclNetSocketHandle));
static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large");
struct ncclNetSocketListenComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
handle->magic = NCCL_SOCKET_MAGIC;
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
NCCLCHECK(ncclSocketListen(&comm->sock));
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
handle->nSocks = comm->nSocks;
handle->nThreads = comm->nThreads;
comm->dev = dev;
@@ -322,46 +301,41 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
return ncclSuccess;
}
ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
if (dev < 0) { // data transfer socket is based on specified dev
ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
return ncclInternalError;
}
enum ncclSocketState conState;
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
struct ncclSocketCommStage* stage = &handle->stage;
struct ncclSocketComm* comm = stage->comm;
int ready;
struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
struct ncclNetSocketCommStage* stage = &handle->stage;
struct ncclNetSocketComm* comm = stage->comm;
uint8_t i = stage->iteration;
struct ncclSocket* sock = stage->sock;
*sendComm = NULL;
if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check;
if (stage->state == ncclSocketCommStateSend) goto socket_send;
if (stage->state == ncclNetSocketCommStateConnect) goto socket_connect_check;
if (stage->state == ncclNetSocketCommStateSend) goto socket_send;
NCCLCHECK(ncclSocketNewComm(&comm));
NCCLCHECK(ncclCalloc(&comm, 1));
stage->comm = comm;
comm->nSocks = handle->nSocks;
comm->nThreads = handle->nThreads;
comm->dev = dev;
CUDACHECK(cudaGetDevice(&comm->cudaDev));
for (; i<comm->nSocks+1; i++) {
sock = i == comm->nSocks ? &comm->ctrlSock : comm->socks+i;
NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1));
sock = (i == comm->nSocks) ? &comm->ctrlSock : comm->socks+i;
NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
stage->sock = sock;
stage->state = ncclSocketCommStateConnect;
stage->state = ncclNetSocketCommStateConnect;
stage->iteration = i;
NCCLCHECK(ncclSocketConnect(sock));
socket_connect_check:
NCCLCHECK(ncclGetSocketState(sock, &conState));
if (conState == ncclSocketConnecting) {
/* expect user to call again */
return ncclSuccess;
} else if (conState == ncclSocketError) {
return ncclRemoteError;
}
stage->state = ncclSocketCommStateSend;
NCCLCHECK(ncclSocketReady(sock, &ready));
if (! ready) return ncclSuccess;
stage->state = ncclNetSocketCommStateSend;
socket_send:
int done = 0;
@@ -372,59 +346,63 @@ socket_send:
return ncclSuccess;
}
ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
struct ncclSocketCommStage* stage = &lComm->stage;
struct ncclSocketComm* rComm = stage->comm;
ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm) {
struct ncclNetSocketListenComm* lComm = (struct ncclNetSocketListenComm*)listenComm;
struct ncclNetSocketCommStage* stage = &lComm->stage;
struct ncclNetSocketComm* rComm = stage->comm;
uint8_t i = stage->iteration;
struct ncclSocket* sock = stage->sock;
int ready;
*recvComm = NULL;
if (stage->state == ncclSocketCommStateAccept) goto socket_accept;
if (stage->state == ncclSocketCommStateRecv) goto socket_recv;
if (stage->state == ncclNetSocketCommStateAccept) goto socket_accept_check;
if (stage->state == ncclNetSocketCommStateRecv) goto socket_recv;
NCCLCHECK(ncclSocketNewComm(&rComm));
NCCLCHECK(ncclCalloc(&rComm, 1));
stage->comm = rComm;
rComm->nSocks = lComm->nSocks;
rComm->nThreads = lComm->nThreads;
rComm->dev = lComm->dev;
CUDACHECK(cudaGetDevice(&rComm->cudaDev));
lComm->sock.asyncFlag = 1;
for (; i<rComm->nSocks+1; i++) {
uint8_t sendSockIdx;
ncclCalloc(&sock, 1);
NCCLCHECK(ncclSocketInit(sock, NULL, lComm->sock.abortFlag, 1));
stage->sock = sock;
stage->state = ncclSocketCommStateAccept;
stage->iteration = i;
socket_accept:
NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
if (sock->fd == -1) return ncclSuccess;
stage->state = ncclSocketCommStateRecv;
NCCLCHECK(ncclCalloc(&sock, 1));
NCCLCHECK(ncclSocketInit(sock));
stage->sock = sock;
stage->state = ncclNetSocketCommStateAccept;
stage->iteration = i;
NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
socket_accept_check:
NCCLCHECK(ncclSocketReady(sock, &ready));
if (!ready) return ncclSuccess;
stage->state = ncclNetSocketCommStateRecv;
socket_recv:
int done = 0;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done));
if (done == 0) return ncclSuccess;
if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
if (sendSockIdx == rComm->nSocks)
memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
else
memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
free(sock);
}
*recvComm = rComm;
/* reset lComm state */
stage->state = ncclSocketCommStateStart;
stage->state = ncclNetSocketCommStateStart;
stage->iteration = 0;
stage->sock = NULL;
stage->comm = NULL;
return ncclSuccess;
}
ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) {
ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketRequest** req) {
for (int i=0; i<MAX_REQUESTS; i++) {
struct ncclSocketRequest* r = comm->requests+i;
struct ncclNetSocketRequest* r = comm->requests+i;
if (r->used == 0) {
r->op = op;
r->data = data;
@@ -441,10 +419,10 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
return ncclInternalError;
}
ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) {
int tid = comm->nextSock % comm->nThreads;
struct ncclSocketThreadResources* res = comm->threadResources+tid;
struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
struct ncclNetSocketThreadResources* res = comm->threadResources+tid;
struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue;
// create helper threads and prepare per-thread task queue
if (queue->tasks == NULL) {
// each request can be divided up to nSocks tasks, and
@@ -459,12 +437,12 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
}
struct ncclSocketTask* r = queue->tasks+queue->next;
struct ncclNetSocketTask* r = queue->tasks+queue->next;
if (r->used == 0) {
r->op = op;
r->data = data;
r->size = size;
r->sock = comm->socks+comm->nextSock;
r->sock = comm->socks + comm->nextSock;
r->offset = 0;
r->result = ncclSuccess;
comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
@@ -480,9 +458,9 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
return ncclInternalError;
}
ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
*done = 0;
struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request;
if (r == NULL) {
WARN("NET/Socket : test called with NULL request");
return ncclInternalError;
@@ -500,9 +478,11 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
// Check size is less or equal to the size provided by the user
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
char line[SOCKET_NAME_MAXLEN+1];
union ncclSocketAddress addr;
ncclSocketGetAddr(r->ctrlSock, &addr);
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
ncclSocketToString(&addr, line), data, r->size);
return ncclInvalidUsage;
}
r->size = data;
@@ -515,7 +495,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
while (chunkOffset < r->size) {
int chunkSize = std::min(taskSize, r->size-chunkOffset);
NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
chunkOffset += chunkSize;
}
}
@@ -525,7 +505,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
if (r->nSubs > 0) {
int nCompleted = 0;
for (int i=0; i<r->nSubs; i++) {
struct ncclSocketTask* sub = r->tasks[i];
struct ncclNetSocketTask* sub = r->tasks[i];
if (sub->result != ncclSuccess) return sub->result;
if (sub->offset == sub->size) nCompleted++;
}
@@ -534,7 +514,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
*done = 1;
r->used = 0;
for (int i=0; i<r->nSubs; i++) {
struct ncclSocketTask* sub = r->tasks[i];
struct ncclNetSocketTask* sub = r->tasks[i];
sub->used = 0;
}
}
@@ -552,43 +532,45 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
return ncclSuccess;
}
ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
}
ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm;
NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request));
return ncclSuccess;
}
ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm;
if (n != 1) return ncclInternalError;
NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request));
NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request));
return ncclSuccess;
}
ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
// We don't support CUDA pointers, so we don't need a flush operation
return ncclInternalError;
}
ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm;
ncclResult_t ncclNetSocketCloseListen(void* opaqueComm) {
struct ncclNetSocketListenComm* comm = (struct ncclNetSocketListenComm*)opaqueComm;
if (comm) {
if (comm->sock.fd != -1) close(comm->sock.fd);
int ready;
NCCLCHECK(ncclSocketReady(&comm->sock, &ready));
if (ready) NCCLCHECK(ncclSocketClose(&comm->sock));
free(comm);
}
return ncclSuccess;
}
ncclResult_t ncclSocketClose(void* opaqueComm) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
ncclResult_t ncclNetSocketClose(void* opaqueComm) {
struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)opaqueComm;
if (comm) {
for (int i=0; i<comm->nThreads; i++) {
struct ncclSocketThreadResources* res = comm->threadResources+i;
struct ncclNetSocketThreadResources* res = comm->threadResources+i;
if (comm->helperThread[i]) {
pthread_mutex_lock(&res->threadLock);
res->stop = 1;
@@ -598,9 +580,12 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
}
free(res->threadTaskQueue.tasks);
}
if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd);
int ready;
NCCLCHECK(ncclSocketReady(&comm->ctrlSock, &ready));
if (ready) NCCLCHECK(ncclSocketClose(&comm->ctrlSock));
for (int i=0; i<comm->nSocks; i++) {
if (comm->socks[i].fd != -1) close(comm->socks[i].fd);
NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready));
if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i]));
}
free(comm);
}
@@ -609,20 +594,20 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
ncclNet_t ncclNetSocket = {
"Socket",
ncclSocketInit,
ncclSocketDevices,
ncclSocketGetProperties,
ncclSocketListen,
ncclSocketConnect,
ncclSocketAccept,
ncclSocketRegMr,
ncclNetSocketInit,
ncclNetSocketDevices,
ncclNetSocketGetProperties,
ncclNetSocketListen,
ncclNetSocketConnect,
ncclNetSocketAccept,
ncclNetSocketRegMr,
NULL, // No DMA-BUF support
ncclSocketDeregMr,
ncclSocketIsend,
ncclSocketIrecv,
ncclSocketIflush,
ncclSocketTest,
ncclSocketClose,
ncclSocketClose,
ncclSocketCloseListen
ncclNetSocketDeregMr,
ncclNetSocketIsend,
ncclNetSocketIrecv,
ncclNetSocketIflush,
ncclNetSocketTest,
ncclNetSocketClose,
ncclNetSocketClose,
ncclNetSocketCloseListen
};
+7 -7
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -37,6 +37,7 @@ struct p2pProxyInfo {
struct p2pShm* devShm;
char shmName[7];
int shmSize;
ncclShmHandle_t handle;
// Intermediate step for sender
struct ncclRecvMem* ceRecvMem;
@@ -67,6 +68,7 @@ struct p2pRecvResources {
struct p2pShm* shm;
struct p2pShm* devShm;
int shmSize;
ncclShmHandle_t handle;
};
#include <sys/types.h>
@@ -379,9 +381,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
resources->shmSize = info->shmSize;
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0));
// Remove the file to ensure proper clean-up
NCCLCHECK(ncclShmUnlink(shmPath));
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
recv->conn.tail = &resources->devShm->recvMem.tail;
recv->conn.head = &resources->devShm->sendMem.head;
@@ -424,7 +424,7 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
NCCLCHECK(ncclShmClose(resources->handle));
}
free(resources);
}
@@ -442,7 +442,7 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
char shmPath[PATH_MAX];
shmPath[0] = '\0';
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1));
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
@@ -505,7 +505,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
if (useMemcpy) {
struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
if (proxyInfo) {
NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
NCCLCHECK(ncclShmClose(proxyInfo->handle));
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
+12 -11
Просмотреть файл
@@ -17,18 +17,22 @@ struct shmSendResources {
int remShmSize;
struct ncclRecvMem* remHostMem;
struct ncclRecvMem* devRemHostMem;
ncclShmHandle_t remHandle;
int shmSize;
struct ncclSendMem* hostMem;
struct ncclSendMem* devHostMem;
ncclShmHandle_t hostHandle;
};
struct shmRecvResources {
int remShmSize;
struct ncclSendMem* remHostMem;
struct ncclSendMem* devRemHostMem;
ncclShmHandle_t remHandle;
int shmSize;
struct ncclRecvMem* hostMem;
struct ncclRecvMem* devHostMem;
ncclShmHandle_t hostHandle;
};
#define SHM_SEND_SIDE 1
@@ -84,7 +88,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
@@ -107,7 +111,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
@@ -137,9 +141,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
// Remove the file to ensure proper clean-up
NCCLCHECK(ncclShmUnlink(shmPath));
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -172,8 +174,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
NCCLCHECK(ncclShmUnlink(shmPath));
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -196,8 +197,8 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
static ncclResult_t shmSendFree(struct ncclConnector* send) {
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
if (resources) {
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
free(resources);
}
return ncclSuccess;
@@ -206,8 +207,8 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
if (resources) {
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
free(resources);
}
return ncclSuccess;
+4 -5
Просмотреть файл
@@ -6,7 +6,7 @@ endif
HIPCC = $(HIP_PATH)/bin/hipcc
EXE = topo_expl
CXXFLAGS = -g -O3 -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE
CXXFLAGS = -g -O3 -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
@@ -17,10 +17,9 @@ $(EXE): $(files)
$(HIPCC) $(CXXFLAGS) $^ -o $@
hipify:
mkdir -p hipify_rccl/include
mkdir -p hipify_rccl/graph
cp ../../src/include/*.h hipify_rccl/include/
cp ../../src/graph/* hipify_rccl/graph/
mkdir -p hipify_rccl
cp -a ../../src/include/ hipify_rccl/
cp -a ../../src/graph/ hipify_rccl/
hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
+9 -1
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -45,4 +45,12 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);
#define TIME_START(index)
#define TIME_STOP(index)
#define TIME_CANCEL(index)
#define TIME_PRINT(name)
#endif
+267 -267
Просмотреть файл
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -245,7 +245,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
struct ncclChannel* channel = &comm->channels[channelId];
uint32_t mask = 1 << channelId;
uint64_t mask = 1UL << channel->id;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
@@ -270,72 +270,87 @@ void dumpData(struct ncclConnect* data, int ndata) {
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
ncclResult_t ret = ncclSuccess;
int highestType = TRANSPORT_P2P; // track highest transport type
//hipStream_t transportSetupStream;
//CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
struct ncclConnect data[2*MAXCHANNELS];
//NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
struct ncclConnect* recvData = data;
int sendChannels = 0, recvChannels = 0;
int type;
TIME_START(0);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
if (recvMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type;
}
}
TIME_STOP(0);
TIME_START(1);
struct ncclConnect* sendData = recvData+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type));
if (sendMask & (1UL<<c)) {
NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
if (type > highestType) highestType = type;
}
}
TIME_STOP(1);
TIME_START(2);
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
//NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
//NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
//NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
//NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
sendData = data;
recvData = data+sendChannels;
}
} else {
//if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
//if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
//if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
//if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
//if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
//if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
//if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
//if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
}
TIME_STOP(2);
TIME_START(3);
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
if (sendMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
//NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
//NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
//CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
//CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
//CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
TIME_STOP(3);
TIME_START(4);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
if (recvMask & (1UL<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
//NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
//NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
conn->connected = 1;
//CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
//CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
}
}
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
TIME_STOP(4);
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
}
//CUDACHECK(hipStreamSynchronize(transportSetupStream));
//CUDACHECK(hipStreamDestroy(transportSetupStream));
if (highestTransportType != NULL) *highestTransportType = highestType;
return ncclSuccess;
TIME_PRINT("P2P Setup/Connect");
exit:
//NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
//NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
return ret;
fail:
goto exit;
}
extern struct ncclTransport collNetTransport;
@@ -466,32 +481,120 @@ RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0);
NCCL_PARAM(AllocP2pNetLLBuffers, "NCCL_ALLOC_P2P_NET_LL_BUFFERS", 0);
RCCL_PARAM(LL128ForceEnable, "LL128_FORCE_ENABLE", 0);
static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collNetGraph) {
ncclResult_t ret = ncclSuccess;
int* heads = NULL;
int rank = comm->rank;
int collNetSetupFail = 0;
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
// Find all head ranks
int nHeads = collNetGraph->nChannels;
int highestTransportType0, highestTransportType1;
char line[1024];
NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
// Head GPU index is always 0
for (int c = 0; c < nHeads; c++) {
heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
}
for (int c = 0; c < comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels + c;
for (int h = 0; h < nHeads; h++) {
const int head = heads[h];
collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
}
// Verify CollNet setup across ranks after trying the first channel
if (c == 0) {
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
}
}
// Verify CollNet setup across ranks after trying all channels
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
line[0] = '\0';
for (int c = 0; c < comm->nChannels; c++) {
struct ncclTree* chain = &comm->channels[c].collnetChain;
snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d",
c, chain->down[0], rank, chain->up);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Collnet Chains %s", line);
// Connect Collnet + chain
for (int c = 0; c < comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels + c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0), ret, fail);
for (int c = 0; c < comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels + c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1), ret, fail);
INFO(NCCL_INIT, "Connected collnet + chain");
// Connect intra-node CollNet + Direct
for (int c = 0; c < comm->nChannels; c++) {
struct ncclChannel* channelRecv = comm->channels + c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0, &highestTransportType0), ret, fail);
for (int c = 0; c < comm->nChannels; c++) {
struct ncclChannel* channelSend = comm->channels + c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1, &highestTransportType1), ret, fail);
#if 0
// Exchange highest intra-node transport type among ranks
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
for (int i = 0; i < comm->localRanks; i++) {
if (highestTypes[i] > comm->intraHighestTransportType)
comm->intraHighestTransportType = highestTypes[i];
}
#endif
INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
exit:
free(heads);
return ret;
fail:
ncclTransportCollNetFree(comm);
comm->collNetSupport = 0;
goto exit;
}
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
// We use 2 AllGathers
// 1. { peerInfo, comm, compCap}
// 2. { nChannels, graphInfo, topoRanks }
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
//uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
cpu_set_t affinitySave;
//TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
// [RCCL] Collect the PID of the root
int rootPid;
//NCCLCHECK(bootstrapInit(commId, comm));
// [/RCCL]
//NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)commId, comm), ret, fail);
// AllGather1 - begin
//NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
//NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, comm->rank));
//NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)));
//NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
//NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, commHash), ret, fail);
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
//If virtualId == -1 multiRank support has not been requested by user, using original interface
if (comm->virtualId == -1) {
for (int i = 0; i < nranks; i++) {
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
return ncclInvalidUsage;
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
ret = ncclInvalidUsage;
goto fail;
}
}
}
@@ -512,8 +615,43 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
}
// AllGather1 - end
do {
// Compute intra-process ranks
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
for (int i = 0; i < nranks; i++) {
if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
&& (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
// Rank is in same process
if (intraProcRanks == 0) intraProcRank0 = i;
if (i == rank) intraProcRank = intraProcRanks;
intraProcRanks++;
if (intraProcRank0 == rank && rank != i) {
comm->peerInfo[i].comm->intraNext = comm->intraNext;
comm->intraNext = comm->peerInfo[i].comm;
}
}
}
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
intraProcRank, intraProcRanks, intraProcRank0);
ret = ncclInternalError;
goto fail;
}
struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
assert(intraProcRank==0 ? comm==comm0 : true);
comm->intraComm0 = comm0;
comm->intraRank = intraProcRank;
comm->intraRanks = intraProcRanks;
comm->intraBarrierPhase = 0;
comm->intraBarrierCounter = 0;
comm->intraBarrierGate = 0;
} while(0);
// Topo detection / System graph creation
//NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
//NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
// save nRanks to ncclTopoSystem as indicator of multi-node
comm->topo->nRanks = comm->nRanks;
// init netGdrLevel
@@ -524,57 +662,54 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
// LL128
comm->topo->ll128Enabled = false;
// Compute paths between GPUs and NICs
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
// Remove inaccessible GPUs and unused NICs
NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
// Recompute paths after trimming
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
// Init search
NCCLCHECK(ncclTopoSearchInit(comm->topo));
NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
// Print final topology
NCCLCHECK(ncclTopoPrint(comm->topo));
NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);
// Set Affinity to a CPU local the our GPU, so that all memory we allocate
// on the host is local.
//NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
//cpu_set_t affinitySave;
// if (CPU_COUNT(&comm->cpuAffinity)) {
//sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
//sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
//NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
//if (CPU_COUNT(&comm->cpuAffinity)) {
// sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
// sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
//}
ncclResult_t ret;
// Launch proxy service thread
//NCCLCHECK(ncclProxyCreate(comm));
//NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
// Get rings and trees
//struct ncclTopoGraph ringGraph;
ringGraph.id = 0;
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
ringGraph.collNet = 0;
ringGraph.minChannels = 1;
ringGraph.maxChannels = MAXCHANNELS/2;
NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail);
//struct ncclTopoGraph treeGraph;
treeGraph.id = 1;
treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
treeGraph.collNet = 0;
treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
treeGraph.maxChannels = ringGraph.nChannels;
NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail);
//struct ncclTopoGraph collNetGraph;
collNetGraph.id = 2;
collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
collNetGraph.collNet = 1;
collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
bool allXgmi = true, hasPeerAccess = true;
bool allXgmi, hasPeerAccess;
allXgmi = true;
hasPeerAccess = true;
// Check that all the GPUs have peer access to one another and are XGMI connected
for (int i = 0; i < nranks && hasPeerAccess; i++) {
int cudaDev1 = comm->peerInfo[i].cudaDev;
@@ -599,7 +734,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
if (comm->rank == ncclParamGraphDumpFileRank()) {
struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 3, graphs), ret, fail);
}
// Determine local CollNet support before all-gather
@@ -623,31 +758,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
}
// AllGather3 - begin
#if 0
struct ncclGraphInfo {
int pattern;
int nChannels;
int sameChannels;
float bwIntra;
float bwInter;
int typeIntra;
int typeInter;
};
struct {
int netDev;
int collNetSupport;
int nc;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
struct ncclGraphInfo collNet;
struct ncclTopoRanks topoRanks;
bool pivotA2AEnabled;
bool ll128Enabled;
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
#endif
//NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
int idx;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
allGather3Data[rank].nc = 2;
@@ -669,7 +780,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
if (ringGraph.nChannels > MAXCHANNELS/2)
allGather3Data[rank].nc = 1;
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev));
NCCLCHECKGOTO(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev), ret, fail);
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -698,8 +809,9 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
return ncclSuccess;
NCCLCHECKGOTO(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks), ret, fail);
fail:
return ret;
}
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
@@ -707,13 +819,20 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
int rank = comm->rank;
int nranks = comm->nRanks;
ncclResult_t ret;
//NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
int nChannelsOrig;
struct ncclTopoRanks** allTopoRanks = NULL;
int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
int *rings = NULL;
int* nvbPeers = NULL;
struct ncclProxyConnector proxyConn;
int* pxnPeers = NULL;
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
// Determine nNodes, firstRanks, ...
int *nodesFirstRank, *nodesTreePatterns;
NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks));
NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
for (int r=0; r<nranks; r++) {
int node;
int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
@@ -727,8 +846,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
comm->rankToNode[r] = node;
}
// Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes));
NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks));
NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
for (int r=0; r<comm->nRanks; r++) {
int node = comm->rankToNode[r];
comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
@@ -736,7 +855,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
}
// Allocate ranks arrays for each node
for (int n=0; n<comm->nNodes; n++) {
NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks));
NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
comm->nodeRanks[n].localRanks = 0;
}
@@ -756,13 +875,14 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
comm->localRank, comm->localRanks, comm->localRankToRank[0]);
return ncclInternalError;
ret = ncclInternalError;
goto fail;
}
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
int nc = allGather3Data[0].nc;
nChannelsOrig = comm->nChannels;
NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
int nc;
nc = allGather3Data[0].nc;
for (int i=0; i<nranks; i++) {
comm->peerInfo[i].netDev = allGather3Data[i].netDev;
allTopoRanks[i] = &allGather3Data[i].topoRanks;
@@ -816,19 +936,14 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
}
}
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc));
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc), ret, fail);
if (comm->topo->pivotA2ANumBiRings == 3) {
NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
NCCLCHECK(ncclBinaryTreePostset(comm, &treeGraph));
}
free(allTopoRanks);
free(nodesTreePatterns);
free(nodesFirstRank);
//free(allGather3Data);
// AllGather3 - end
@@ -854,125 +969,41 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
INFO(NCCL_INIT, "BinTrees%s", binline);
}
//NCCLCHECK(computeBuffSizes(comm));
//NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);
// Connect with prev/next for each ring
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail);
if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
comm->useIntraNet = 1;
// Connect NET for intranode use
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, fail);
}
free(rings);
INFO(NCCL_INIT, "Connected all rings");
// Connect Trees
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
// RCCL: need to connect binTree as well
if (comm->topo->pivotA2ANumBiRings == 3) {
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->binTree.down, 1, &channel->binTree.up, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->binTree.up, NCCL_MAX_TREE_ARITY, channel->binTree.down, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail);
INFO(NCCL_INIT, "Connected all trees");
// Check if we can setup CollNet
if (comm->collNetSupport > 0) {
int collNetSetupFail = 0;
int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P};
// Find all head ranks
int nHeads = collNetGraph.nChannels;
int *heads;
NCCLCHECK(ncclCalloc(&heads, nHeads));
// Head GPU index is always 0
for (int c=0; c<nHeads; c++) {
heads[c] = collNetGraph.intra[c*comm->localRanks+0];
}
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
for (int h=0; h<nHeads; h++) {
const int head = heads[h];
collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv);
collNetSetupFail += ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
}
// Verify CollNet setup across ranks after trying the first channel
if (c == 0) {
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
}
}
// Verify CollNet setup across ranks after trying all channels
NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);
char line[1024];
line[0]='\0';
for (int c=0; c<comm->nChannels; c++) {
struct ncclTree* chain = &comm->channels[c].collnetChain;
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d",
c, chain->down[0], rank, chain->up);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Collnet Chains %s", line);
// Connect Collnet + chain
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, collnet_cleanup);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, collnet_cleanup);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
INFO(NCCL_INIT, "Connected collnet + chain");
// Connect intra-node CollNet + Direct
int highestTransportType0, highestTransportType1;
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, collnet_cleanup);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelSend = comm->channels+c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, collnet_cleanup);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
// Exchange highest intra-node transport type among ranks
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
//NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)));
for (int i=0; i<comm->localRanks; i++) {
if (highestTypes[i] > comm->intraHighestTransportType)
comm->intraHighestTransportType = highestTypes[i];
}
INFO(NCCL_INIT, "rank %d Connected CollNet comm %p nRanks %02d", rank, comm, comm->nRanks);
collnet_cleanup:
free(heads);
if (ret != ncclSuccess) {
NCCLCHECK(ncclTransportCollNetFree(comm));
comm->collNetSupport = 0;
ret = ncclSuccess;
}
}
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
// Compute time models for algorithm and protocol combinations
@@ -983,11 +1014,11 @@ collnet_cleanup:
minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
}
NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
} while(0);
// Compute nChannels per peer for p2p
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
#if 0
do { // Setup p2p structures in comm->tasks
struct ncclTasks* tasks = &comm->tasks;
@@ -1038,79 +1069,40 @@ collnet_cleanup:
if (ncclParamNvbPreconnect()) {
// Connect p2p when using NVB path
int nvbNpeers;
int* nvbPeers;
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
for (int r=0; r<nvbNpeers; r++) {
int peer = nvbPeers[r];
int channelId;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail);
if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
comm->connectSend[peer] |= (1<<channelId);
comm->connectSend[peer] |= (1UL<<channelId);
}
}
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail);
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
comm->connectRecv[peer] |= (1<<channelId);
comm->connectRecv[peer] |= (1UL<<channelId);
}
}
}
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
free(nvbPeers);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
}
#endif
// Connect to local net proxy
struct ncclProxyConnector proxyConn;
//NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
//NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
//NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
//NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
// Then to remote ones when using PXN
if (ncclPxnDisable(comm) == 0) {
int nranks;
int* pxnPeers;
NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
for (int r=0; r<nranks; r++) {
//NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
// NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
//NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
//NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
}
free(pxnPeers);
}
do {
// Compute intra-process ranks
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
for (int i = 0; i < nranks; i++) {
if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
&& (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
// Rank is in same process
if (intraProcRanks == 0) intraProcRank0 = i;
if (i == rank) intraProcRank = intraProcRanks;
intraProcRanks++;
if (intraProcRank0 == rank && rank != i) {
comm->peerInfo[i].comm->intraNext = comm->intraNext;
comm->intraNext = comm->peerInfo[i].comm;
}
}
}
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
intraProcRank, intraProcRanks, intraProcRank0);
return ncclInternalError;
}
struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
assert(intraProcRank==0 ? comm==comm0 : true);
comm->intraComm0 = comm0;
comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0;
comm->intraRank = intraProcRank;
comm->intraRanks = intraProcRanks;
comm->intraBarrierPhase = 0;
comm->intraBarrierCounter = 0;
comm->intraBarrierGate = 0;
} while(0);
#if 0
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
@@ -1129,23 +1121,31 @@ collnet_cleanup:
}
}
NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
// Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
// launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
/* Local intra-node barrier */
//NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
// Unlink proxy shm to make sure it will be properly cleaned up.
NCCLCHECK(ncclProxyShmUnlink(comm));
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
#endif
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
affinity_restore:
//if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
if (ret != ncclSuccess) return ret;
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
exit:
//if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
// Unlink proxy shm to make sure it will be properly cleaned up.
//ncclProxyShmUnlink(comm);
free(allTopoRanks);
free(nodesTreePatterns);
free(nodesFirstRank);
//free(allGather3Data);
free(rings);
free(nvbPeers);
free(pxnPeers);
return ret;
fail:
goto exit;
}
ncclResult_t rocm_smi_init() {