diff --git a/CMakeLists.txt b/CMakeLists.txt index 945641c65b..89bd866ad5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,18 +173,22 @@ set(CC_SOURCES src/collectives/all_to_all_api.cc src/collectives/all_to_allv_api.cc src/channel.cc - src/clique/CliqueManager.cc # RCCL - src/clique/HandleCache.cc # RCCL - src/clique/HandleShm.cc # RCCL - src/clique/Hash.cc # RCCL - src/clique/MsgQueue.cc # RCCL - src/clique/ShmObject.cc # RCCL + #src/clique/CliqueManager.cc # RCCL + #src/clique/HandleCache.cc # RCCL + #src/clique/HandleShm.cc # RCCL + #src/clique/Hash.cc # RCCL + #src/clique/MsgQueue.cc # RCCL + #src/clique/ShmObject.cc # RCCL src/misc/argcheck.cc src/misc/nvmlwrap_stub.cc src/misc/utils.cc src/misc/ibvwrap.cc src/misc/nvmlwrap_stub.cc src/misc/rocm_smi_wrap.cc + src/misc/profiler.cc + src/misc/shmutils.cc + src/misc/socket.cc + src/misc/param.cc src/transport/coll_net.cc src/transport/net.cc src/transport/net_ib.cc @@ -196,6 +200,7 @@ set(CC_SOURCES src/group.cc src/bootstrap.cc src/proxy.cc + src/net.cc src/enqueue.cc) foreach(filename ${CC_SOURCES}) @@ -212,11 +217,6 @@ if(PROFILE) add_definitions(-DENABLE_PROFILING) endif() -if(TIMING_PROFILE) - add_definitions(-DENABLE_PROFILING) - add_definitions(-DENABLE_TIMING_PROFILE) -endif() - set(COLLTRACE 1 CACHE BOOL "Collective Trace Option") if(COLLTRACE) add_definitions(-DENABLE_COLLTRACE) diff --git a/ext-net/google-fastsocket/Makefile b/ext-net/google-fastsocket/Makefile index e40e3053ad..8dfa8ca4ae 100644 --- a/ext-net/google-fastsocket/Makefile +++ b/ext-net/google-fastsocket/Makefile @@ -1,10 +1,10 @@ CUDA_HOME?=/usr/local/cuda INC:=-I$(CUDA_HOME)/include -PLUGIN_SO:=../../build/libnccl-net.so +PLUGIN_SO:=libnccl-net.so default: $(PLUGIN_SO) -$(PLUGIN_SO): nccl-fastsocket/net_fastsocket.cc nccl-fastsocket/compat.cc +$(PLUGIN_SO): nccl-fastsocket/net_fastsocket.cc nccl-fastsocket/compat.cc nccl-fastsocket/utilities.cc $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ nccl-fastsocket/%.cc: 
diff --git a/makefiles/common.mk b/makefiles/common.mk index 64f8d2dc6e..1a1c2b66f8 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -23,7 +23,6 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) - # You should define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \ @@ -39,7 +38,7 @@ CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 # Include Ampere support if we're using CUDA11 or above ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) - NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX) + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) # Include Volta support if we're using CUDA9 or above else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) diff --git a/makefiles/version.mk b/makefiles/version.mk index 22bddcee2e..7c9bf0f136 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 11 -NCCL_PATCH := 4 +NCCL_MINOR := 12 +NCCL_PATCH := 10 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index a548840b3d..82e21a04ea 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -9,8 +9,8 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \ - misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc \ +LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \ + misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc @@ -74,14 +74,14 @@ $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) +null := +space := $(null) # +comma := , + $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ mkdir -p $(LIBDIR) - $(eval TMP := $(shell mktemp -d)) - cp $(LIBOBJ) $(TMP) - cd $(TMP) && ar x $(DEVICELIB) && cd - - ar cr $@ $(LIBOBJ) $(TMP)/*.o - rm -Rf $(TMP) + printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M $(PKGDIR)/nccl.pc : nccl.pc.in mkdir -p $(PKGDIR) @@ -121,7 +121,7 @@ clean : $(MAKE) -C collectives/device clean rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} -install : lib +install : build mkdir -p $(PREFIX)/lib mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include diff --git a/src/bootstrap.cc b/src/bootstrap.cc index b38f8be0bb..daaa8cdbb7 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -1,6 +1,5 @@ 
/************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,18 +9,13 @@ #include "utils.h" #include "bootstrap.h" #include "net.h" -#include "socket.h" #include #include -// [RCCL] -#include "clique/CliqueManager.h" -#include "clique/CliqueShmNames.h" -#include "clique/Hash.h" -// [/RCCL] +#include "proxy.h" /* Init functions */ static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1]; -static union socketAddress bootstrapNetIfAddr; +static union ncclSocketAddress bootstrapNetIfAddr; static int bootstrapNetInitDone = 0; pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; @@ -31,17 +25,17 @@ ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { char* env = getenv("NCCL_COMM_ID"); if (env) { - union socketAddress remoteAddr; - if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) { + union ncclSocketAddress remoteAddr; + if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); return ncclInvalidArgument; } - if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { WARN("NET/Socket : No usable listening interface found"); return ncclSystemError; } } else { - int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); return ncclInternalError; @@ -49,7 
+43,7 @@ ncclResult_t bootstrapNetInit() { } char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2]; sprintf(line, " %s:", bootstrapNetIfName); - socketToString(&bootstrapNetIfAddr, line+strlen(line)); + ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line)); INFO(NCCL_INIT, "Bootstrap : Using%s", line); bootstrapNetInitDone = 1; } @@ -61,35 +55,28 @@ ncclResult_t bootstrapNetInit() { /* Socket Interface Selection type */ enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; -static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd, union socketAddress *addr) { - struct sockaddr *saddr = &addr->sa; - socklen_t socklen = sizeof(union socketAddress); - SYSCHECKVAL(accept(listenFd, saddr, &socklen), "accept", *recvFd); - return ncclSuccess; -} - // Additional sync functions -static ncclResult_t bootstrapNetSend(int fd, union socketAddress *addr, void* data, int size) { - NCCLCHECK(socketSend(fd, addr, &size, sizeof(int))); - NCCLCHECK(socketSend(fd, addr, data, size)); +static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) { + NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, data, size)); return ncclSuccess; } -static ncclResult_t bootstrapNetRecv(int fd, union socketAddress *addr, void* data, int size) { +static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) { int recvSize; - NCCLCHECK(socketRecv(fd, addr, &recvSize, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { WARN("Message truncated : received %d bytes instead of %d", recvSize, size); return ncclInternalError; } - NCCLCHECK(socketRecv(fd, addr, data, std::min(recvSize, size))); + NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size))); return ncclSuccess; } struct extInfo { int rank; int nranks; - union socketAddress extAddressListenRoot; - union socketAddress extAddressListen; + union ncclSocketAddress extAddressListenRoot; + union 
ncclSocketAddress extAddressListen; }; #include @@ -102,33 +89,25 @@ static ncclResult_t setFilesLimit() { return ncclSuccess; } -static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to include hash argument) - - // [RCCL] Unpack bootstrapRootStruct - struct bootstrapRootStruct rootStruct = *(struct bootstrapRootStruct*)bootstrapRootStruct; - int listenFd = rootStruct.listenFd; - unsigned long hash = rootStruct.hash; - int pid = getpid(); // sharing PID to other ranks for creating shared memory files for CliqueManager - free(bootstrapRootStruct); - // [/RCCL] - +static void *bootstrapRoot(void* args) { + struct ncclSocket* listenSock = (struct ncclSocket*)args; ncclResult_t res = ncclSuccess; int nranks = 0, c = 0; struct extInfo info; - union socketAddress *rankAddresses = NULL; - union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange - union socketAddress *zero = NULL; + union ncclSocketAddress *rankAddresses = NULL; + union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange + union ncclSocketAddress *zero = NULL; NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out); setFilesLimit(); TRACE(NCCL_INIT, "BEGIN"); /* Receive addresses from all ranks */ do { - int tmpFd; - union socketAddress addr; - NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd, &addr), res, out); - NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &addr, &info, sizeof(info)), res, out); - close(tmpFd); + struct ncclSocket sock; + sock.abortFlag = NULL; + NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out); + NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); + close(sock.fd); if (c == 0) { nranks = info.nranks; @@ -141,40 +120,35 @@ static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to in goto out; } - if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) { + if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union 
ncclSocketAddress)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); goto out; } // Save the connection handle for that rank - memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress)); - memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress)); + memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress)); + memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress)); ++c; TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); } while (c < nranks); TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks); - { // [RCCL] Initialize message queues / shared memory files - NCCLCHECKGOTO(CliqueManager::BootstrapRootInit(pid, hash), res, out); - } // [/RCCL] - // Send the connect handle for the next rank in the AllGather ring for (int r=0; rfd); + free(listenSock); if (rankAddresses) free(rankAddresses); if (rankAddressesRoot) free(rankAddressesRoot); if (zero) free(zero); @@ -184,36 +158,32 @@ out: } ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) { - union socketAddress* connectAddr = (union socketAddress*) id; - int listenFd; - NCCLCHECK(createListenSocket(&listenFd, connectAddr)); + struct ncclSocket* listenSock; + NCCLCHECK(ncclCalloc(&listenSock, 1)); + memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(listenSock)); + memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress)); pthread_t thread; - - // [RCCL] Use the ncclUniqueId to get a hash for bootstrap - struct bootstrapRootStruct* rootStruct = new struct bootstrapRootStruct; - rootStruct->hash = djb2Hash(id->internal); - rootStruct->listenFd = listenFd; - pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct); - pthread_detach(thread); // [RCCL] Adding detach to properly clean up bootstrapRoot thread - // [/RCCL] - + 
pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock); + pthread_detach(thread); // will not be pthread_join()'d + ncclSetThreadName(thread, "NCCL BootstrapR"); return ncclSuccess; } ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) { - static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); + static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); memset(id, 0, sizeof(ncclUniqueId)); - union socketAddress* connectAddr = (union socketAddress*) id; + union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id; char* env = getenv("NCCL_COMM_ID"); if (env) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); - if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) { + if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); return ncclInvalidArgument; } } else { - memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress)); + memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); NCCLCHECK(bootstrapCreateRoot(id, false)); } @@ -223,157 +193,51 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) { struct unexConn { int peer; int tag; - int fd; - union socketAddress addr; + struct ncclSocket sock; struct unexConn* next; }; -// Remote allocator state -struct remAllocState { - int cudaDev; - int listenFd; - volatile int stop; -}; - -struct extState { - int extListenFd; - int extRingRecvFd; - int extRingSendFd; - union socketAddress extRingRecvAddr, extRingSendAddr; - union socketAddress* peerCommAddresses; - union socketAddress* peerAllocAddresses; +struct bootstrapState { + struct ncclSocket listenSock; + struct ncclSocket ringRecvSocket; + struct ncclSocket ringSendSocket; + union ncclSocketAddress* peerCommAddresses; + union ncclSocketAddress* peerProxyAddresses; struct unexConn* unexpectedConnections; int cudaDev; int rank; int 
nranks; - - // Intermediate memory allocation service - struct remAllocState* allocState; - pthread_t allocThread; + volatile uint32_t *abortFlag; }; -#define MAX_SEGMENTS 128 - -static ncclResult_t remoteAlloc(void** ptr, int fd, union socketAddress *addr) { - size_t size; - NCCLCHECK(socketRecv(fd, addr, &size, sizeof(size_t))); - hipIpcMemHandle_t devIpc; - NCCLCHECK(ncclCudaCalloc((char**)ptr, size)); - hipError_t res = hipIpcGetMemHandle(&devIpc, *ptr); - if (res != hipSuccess) { - WARN("[Rem Allocator] hipIpcGetMemHandle failed : %s", hipGetErrorString(res)); - hipFree(*ptr); - CUDACHECK(res); - } - // The CUDA IPC - NCCLCHECK(socketSend(fd, addr, &devIpc, sizeof(hipIpcMemHandle_t))); - // And the direct pointer - NCCLCHECK(socketSend(fd, addr, ptr, sizeof(void*))); - return ncclSuccess; -} - -#include - -// Service thread to allocate memory for other GPUs, used as intermediate step. -void* ncclRemoteMemAllocationService(void* args) { - struct remAllocState* state = (struct remAllocState *) args; - if (hipSetDevice(state->cudaDev) != hipSuccess) { - WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev); - } - - // Prepare poll descriptor - void* segments[MAX_SEGMENTS]; - struct pollfd pollfds[MAX_SEGMENTS+1]; - for (int s=0; slistenFd; - pollfds[MAX_SEGMENTS].events = POLLIN; - - int nbuffers = 0; - while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) { - if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) { - WARN("[Rem Allocator] Poll failed with error %d", error); - return NULL; - } - if (pollfds[MAX_SEGMENTS].revents) { - int s = 0; - union socketAddress addr; - while (segments[s] != NULL && s < MAX_SEGMENTS) s++; - if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd, &addr) != ncclSuccess) { - pollfds[s].fd = -1; - } else { - if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd, &addr) != ncclSuccess)) { - WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd); - 
close(pollfds[s].fd); - pollfds[s].fd = -1; - } else { - nbuffers++; - } - } - } - for (int s=0; slistenFd); - free(state); - return NULL; -} - -ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr) { - struct extState* state = (struct extState*)commState; - int fd; - ncclResult_t res; - *id = -1; - union socketAddress *addr = state->peerAllocAddresses+rank; - NCCLCHECK(connectAddress(&fd, addr)); - NCCLCHECKGOTO(socketSend(fd, addr, &size, sizeof(size_t)), res, end); - NCCLCHECKGOTO(socketRecv(fd, addr, ipc, sizeof(hipIpcMemHandle_t)), res, end); - NCCLCHECKGOTO(socketRecv(fd, addr, ptr, sizeof(void*)), res, end); - *id = fd; -end: - return res; -} - -ncclResult_t bootstrapRemFree(int id, int rank, void* commState) { - SYSCHECK(close(id), "close"); - return ncclSuccess; -} - -ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState, int* rootPid) { // [RCCL] Adding rootPid - struct extState* state; +ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) { + int rank = comm->rank; + int nranks = comm->nRanks; + struct bootstrapState* state; NCCLCHECK(ncclCalloc(&state, 1)); state->rank = rank; state->nranks = nranks; - *commState = state; + state->abortFlag = comm->abortFlag; + comm->bootstrap = state; TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); struct extInfo info = { 0 }; info.rank = rank; info.nranks = nranks; - int tmpSendFd, tmpRecvFd; + struct ncclSocket sock, listenSockRoot; + sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag; + sock.asyncFlag = listenSockRoot.asyncFlag = 0; - int extListenFdRoot; - memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress)); - memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress)); - NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen)); - NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot)); + // Create 
socket for other ranks to contact me + memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(&state->listenSock)); + memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress)); + + // Create socket for root to contact me + memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(&listenSockRoot)); + memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress)); // stagger connection times to avoid an overload of the root if (nranks > 128) { @@ -386,38 +250,36 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS } // send info on my listening socket to root - union socketAddress* rootAddr = (union socketAddress*)id; - NCCLCHECK(connectAddress(&tmpSendFd, rootAddr)); - NCCLCHECK(bootstrapNetSend(tmpSendFd, rootAddr, &info, sizeof(info))); - close(tmpSendFd); + memcpy(&sock.addr, id, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); + close(sock.fd); // get info on my "next" rank in the bootstrap ring from root - union socketAddress addr; - NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd, &addr)); - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &state->extRingSendAddr, sizeof(state->extRingSendAddr))); - { // [RCCL] Receive PID from root - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, rootPid, sizeof(int))); - } // [/RCCL] - close(tmpRecvFd); - close(extListenFdRoot); + NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot)); + NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress))); + close(sock.fd); + close(listenSockRoot.fd); - NCCLCHECK(connectAddress(&state->extRingSendFd, &state->extRingSendAddr)); + NCCLCHECK(ncclSocketConnect(&state->ringSendSocket)); // Accept the connect request from the previous rank in the AllGather ring - 
NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd, &state->extRingRecvAddr)); + NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock)); // AllGather all listen handlers NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks)); - memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress)); - NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress))); + memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress)); + NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress))); - // Create the memory allocation service - NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks)); - memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress)); - NCCLCHECK(ncclCalloc(&state->allocState, 1)); - CUDACHECK(hipGetDevice(&state->allocState->cudaDev)); - NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank)); - pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState); - NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress))); + // Create the service proxy + NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks)); + struct ncclSocket* proxySocket; + NCCLCHECK(ncclCalloc(&proxySocket, 1)); + proxySocket->abortFlag = NULL; // proxy is aborted through a message + memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(proxySocket)); + memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress)); + NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress))); + NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses)); TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); @@ -425,7 +287,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, 
int rank, int nranks, void** commS } ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { - struct extState* state = (struct extState*)commState; + struct bootstrapState* state = (struct bootstrapState*)commState; char* data = (char*)allData; int rank = state->rank; int nranks = state->nranks; @@ -441,9 +303,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { size_t sslice = (rank - i + nranks) % nranks; // Send slice to the right - NCCLCHECK(bootstrapNetSend(state->extRingSendFd, &state->extRingSendAddr, data+sslice*size, size)); + NCCLCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size)); // Recv slice from the left - NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, &state->extRingRecvAddr, data+rslice*size, size)); + NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size)); } TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); @@ -451,14 +313,15 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { } ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) { - struct extState* state = (struct extState*)commState; - int tmpSendFd; - union socketAddress *addr = state->peerCommAddresses+peer; - NCCLCHECK(connectAddress(&tmpSendFd, addr)); - NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &state->rank, sizeof(int))); - NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &tag, sizeof(int))); - NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, data, size)); - close(tmpSendFd); + struct bootstrapState* state = (struct bootstrapState*)commState; + struct ncclSocket sock; + sock.abortFlag = state->abortFlag; + memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int))); + NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int))); + NCCLCHECK(bootstrapNetSend(&sock, data, size)); + close(sock.fd); return 
ncclSuccess; } @@ -499,14 +362,13 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, return ncclSuccess; } -ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd, union socketAddress *addr) { +ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) { // New unex struct unexConn* unex; NCCLCHECK(ncclCalloc(&unex, 1)); unex->peer = peer; unex->tag = tag; - unex->fd = fd; - unex->addr = *addr; + memcpy(&unex->sock, sock, sizeof(struct ncclSocket)); // Enqueue struct unexConn* list = state->unexpectedConnections; @@ -519,7 +381,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd return ncclSuccess; } -int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAddress *addr) { +ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; while (elem) { @@ -529,81 +391,72 @@ int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAdd } else { prev->next = elem->next; } - int fd = elem->fd; - *addr = elem->addr; + memcpy(sock, &elem->sock, sizeof(struct ncclSocket)); free(elem); - return fd; + return ncclSuccess; } prev = elem; elem = elem->next; } - return -1; + sock->fd = -1; + return ncclSuccess; } // We can't know who we'll receive from, so we need to receive everything at once ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) { - struct extState* state = (struct extState*)commState; + struct bootstrapState* state = (struct bootstrapState*)commState; - int tmpRecvFd; - union socketAddress addr; + struct ncclSocket sock; + sock.abortFlag = state->abortFlag; // Search unexpected connections first - if ((tmpRecvFd = unexpectedDequeue(state, peer, tag, &addr)) != -1) { - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), 
size)); - close(tmpRecvFd); + NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock)); + if (sock.fd != -1) { + NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size)); + close(sock.fd); return ncclSuccess; } // Then look for new connections while (1) { - union socketAddress addr; - NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd, &addr)); + NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock)); int newPeer, newTag; - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newPeer, sizeof(int))); - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newTag, sizeof(int))); + NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int))); + NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int))); if (newPeer == peer && newTag == tag) { - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size)); - close(tmpRecvFd); + NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size)); + close(sock.fd); return ncclSuccess; } // Unexpected connection. Save for later. - NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, tmpRecvFd, &addr)); + NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock)); } } ncclResult_t bootstrapClose(void* commState) { - struct extState* state = (struct extState*)commState; + struct bootstrapState* state = (struct bootstrapState*)commState; if (state->unexpectedConnections != NULL) { WARN("Unexpected connections are not empty"); return ncclInternalError; } - close(state->extListenFd); - close(state->extRingSendFd); - close(state->extRingRecvFd); - - state->allocState->stop = 1; - - // Join the allocThread so we catch resource leaks as being hung here - // [RCCL] Uncommenting this join to clean up the allocThread - pthread_join(state->allocThread, nullptr); - // [/RCCL] + close(state->listenSock.fd); + close(state->ringSendSocket.fd); + close(state->ringRecvSocket.fd); free(state->peerCommAddresses); - free(state->peerAllocAddresses); free(state); return ncclSuccess; } ncclResult_t bootstrapAbort(void* commState) { - struct extState* state = (struct 
extState*)commState; + struct bootstrapState* state = (struct bootstrapState*)commState; if (commState == NULL) return ncclSuccess; - if (state->extListenFd) close(state->extListenFd); - if (state->extRingSendFd) close(state->extRingSendFd); - if (state->extRingRecvFd) close(state->extRingRecvFd); - if (state->allocState) state->allocState->stop = 2; + if (state->listenSock.fd) close(state->listenSock.fd); + if (state->ringSendSocket.fd) close(state->ringSendSocket.fd); + if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd); free(state->peerCommAddresses); - free(state->peerAllocAddresses); + free(state->peerProxyAddresses); free(state); return ncclSuccess; } diff --git a/src/channel.cc b/src/channel.cc index 5fa25c7bce..e9cfa6664f 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -65,13 +65,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { for (int r=0; rpeers+r; for (int b=0; bsend[b].transportResources) NCCLCHECK(peer->send[b].transportComm->free(peer->send[b].transportResources)); + if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); } } for (int r=0; rpeers+r; for (int b=0; brecv[b].transportResources) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv[b].transportResources)); + if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); } } diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 7a1e3e1da7..86ed853632 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -13,9 +13,9 @@ namespace { template __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem->channel.ring; const int *ringRanks = ring->devUserRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); @@ -23,12 +23,12 @@ namespace { const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); const int nranks = ncclShmem->comm.nRanks; const ssize_t loopSize = nChannels*int(chunkSize); - const ssize_t size = args->coll.count; + const ssize_t size = args->count; T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; - Primitives, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg, args->coll.connIndex << 16); + Primitives, 0, Proto, 0> prims + (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -37,7 +37,7 @@ namespace { realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); realChunkSize = int(realChunkSize); diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu index a22caaa989..e7c3c28cfb 100644 --- a/src/collectives/device/all_reduce.cu +++ b/src/collectives/device/all_reduce.cu @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. -* Modifications Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. + * * See LICENSE.txt for license information ************************************************************************/ @@ -8,7 +8,4 @@ #include "common.h" #include "collectives.h" -// [RCCL] -// IMPL_COLL_R(AllReduce); -IMPL_COLL_CLIQUE(AllReduce); -// [/RCCL] +IMPL_COLL_R(AllReduce); diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 6398388ed9..c92ce89d33 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,26 +8,26 @@ #include "devcomm.h" #include "collectives.h" #include "primitives.h" -#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support +//#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support namespace { template __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem->channel.ring; int ringIx = ring->index; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1)); const int nranks = ncclShmem->comm.nRanks; const ssize_t loopSize = nChannels*nranks*chunkSize; - const ssize_t size = args->coll.count; #ifdef ENABLE_PROFILING auto devProf = ncclShmem->comm.devProf; uint64_t clk, t0 = 0ULL, ws; if (tid == 0) clk = __builtin_amdgcn_s_memrealtime(); #endif + const ssize_t size = args->count; int minChunkSize; if (Proto::Id == NCCL_PROTO_LL) @@ -37,8 +37,8 @@ namespace { minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2; } - Primitives, 0, Proto> prims - (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg, args->coll.connIndex << 16); + Primitives, 0, Proto, 0> prims + (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -110,32 +110,35 @@ namespace { ACCUMULATE_COUNTER(directRecv); } #ifdef ENABLE_PROFILING - if (tid == 0 && args->coll.opCount) devProf->elems[blockIdx.x].total_cycle 
+= (__builtin_amdgcn_s_memrealtime() - clk); + if (tid == 0) { + struct ncclProfElem *elem = devProf.elems+args->opCount; + elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk); + } #endif } template __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem->channel.tree; ssize_t chunkSize = int( - Proto::Id == NCCL_PROTO_SIMPLE ? args->coll.lastChunkSize + Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize /* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T)); const ssize_t minChunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T)) /* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = int(nChannels*chunkSize); - const ssize_t size = args->coll.count; + const ssize_t size = args->count; if (loopSize > size) chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize); { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) - Primitives, /*Direct=*/0, Proto> prims - (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives, /*Direct=*/0, Proto, 0> prims + (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -160,8 +163,8 @@ namespace { } { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) - Primitives, /*Direct=*/0, Proto> prims - (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives, 
/*Direct=*/0, Proto, 0> prims + (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -189,19 +192,19 @@ namespace { template __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem->channel.tree; ssize_t chunkSize = int( - Proto::Id != NCCL_PROTO_LL ? args->coll.lastChunkSize + Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize : Proto::calcBytePerStep()/sizeof(T)); const ssize_t minChunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T)) : Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T)) /* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8); const ssize_t loopSize = int(nChannels*chunkSize); - const ssize_t size = args->coll.count; + const ssize_t size = args->count; int nthreadsSplit; if (Proto::Id == NCCL_PROTO_SIMPLE) { @@ -218,8 +221,8 @@ namespace { if (tree->up == -1) { // Reduce and broadcast. 
Max number of recv is 3, max number of send is 3 - Primitives, /*Direct=*/0, Proto> - prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); @@ -235,8 +238,8 @@ namespace { * into DirectRecv and DirectSend capabilities, this ctor would have both=0, * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. */ - Primitives, /*Direct=*/0, Proto> - prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, 0*Proto::MaxGroupWidth); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -254,8 +257,8 @@ namespace { } else { // Broadcast down. 
Max number of recv is 1, max number of send is 3 (binary tree + local) - Primitives, /*Direct=*/0, Proto> - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, 1*Proto::MaxGroupWidth); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -294,11 +297,11 @@ struct RunWorkElementcoll.bid; - const int nChannels = args->coll.nChannels; + const int bid = args->bid; + const int nChannels = args->nChannels; struct ncclDirect* tree = &ncclShmem->channel.collTree; - const ssize_t chunkSize = int(args->coll.lastChunkSize); - const ssize_t size = args->coll.count; + const ssize_t chunkSize = int(args->lastChunkSize); + const ssize_t size = args->count; const ssize_t loopSize = nChannels*tree->nHeads*chunkSize; const int hasUp = (tree->up[0] >= 0) ? 
1 : 0; @@ -306,7 +309,7 @@ struct RunWorkElementnThreads - nThreadsScatter - nThreadsGather - nThreadsBcast; + const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; @@ -316,8 +319,8 @@ struct RunWorkElement= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter int group = (2*Proto::MaxGroupWidth) | (1<<16); - Primitives, /*Direct=*/0, Proto> - prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; int nelem = min(tree->nHeads*chunkSize, size-offset); @@ -331,8 +334,8 @@ struct RunWorkElement, /*Direct=*/0, Proto> - prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -344,8 +347,8 @@ struct RunWorkElement, /*Direct=*/0, Proto> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < 
size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -355,8 +358,8 @@ struct RunWorkElement, /*Direct=*/0, Proto> - prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; int nelem = min(tree->nHeads*chunkSize, size-offset); @@ -366,8 +369,8 @@ struct RunWorkElement, /*Direct=*/0, Proto> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -375,8 +378,8 @@ struct RunWorkElement, /*Direct=*/0, Proto> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->coll.redOpArg, group); + Primitives, /*Direct=*/0, Proto, 0> + prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -404,13 +407,15 @@ struct RunWorkElement struct RunWorkElement { __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { - LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args); + runRing(args); + 
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args); } }; template struct RunWorkElement { __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { - LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args); + runTreeSplit(args); + //LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args); } }; diff --git a/src/collectives/device/alltoall_pivot.h b/src/collectives/device/alltoall_pivot.h index a6740cd3e5..4b00ddbca8 100644 --- a/src/collectives/device/alltoall_pivot.h +++ b/src/collectives/device/alltoall_pivot.h @@ -12,24 +12,24 @@ namespace { template __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; const int nranks = ncclShmem->comm.nRanks; const ncclRing *ring = &ncclShmem->channel.ring; - const int num_bi_rings = args->coll.pivotA2ANumBiRings; + const int num_bi_rings = args->pivotA2ANumBiRings; const int num_uni_rings = num_bi_rings * 2; - const int num_chunks = args->coll.nChannels / 2; + const int num_chunks = args->nChannels / 2; const int chunk_id = (bid % num_bi_rings) + (bid / num_uni_rings * num_bi_rings); - const int elem_size = args->coll.count % 256 ? 1 : 256; - const ssize_t num_elems = args->coll.count / elem_size; + const int elem_size = args->count % 256 ? 1 : 256; + const ssize_t num_elems = args->count / elem_size; const int num_padding_chunks = num_elems % num_chunks; const ssize_t chunk_offset = elem_size * (num_elems / num_chunks * chunk_id + (chunk_id < num_padding_chunks ? chunk_id : num_padding_chunks)); const ssize_t chunk_size = elem_size * (num_elems / num_chunks + (chunk_id < num_padding_chunks ? 1 : 0)); const int pivot_direction = (bid % num_uni_rings) / num_bi_rings; const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
ALLTOALL_PIVOT_CHUNKSTEPS : 1)); - Primitives, 0, Proto> prims - (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0, args->coll.connIndex << 16); + Primitives, 0, Proto, 0> prims + (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0, args->connIndex << 16); for (int num_hops = 0; num_hops <= nranks / 2; num_hops++) { const int src_rank = ring->devUserRanks[(nranks - num_hops) % nranks]; diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index be01d90836..a97836c672 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,27 +12,27 @@ namespace { template __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem->channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
BROADCAST_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = nChannels*chunkSize; - const ssize_t size = args->coll.count; + const ssize_t size = args->count; const int rank = ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; - const int root = args->coll.root; #ifdef ENABLE_PROFILING auto devProf = ncclShmem->comm.devProf; uint64_t clk, t0 = 0ULL, ws; if (tid == 0) clk = __builtin_amdgcn_s_memrealtime(); #endif + const int root = args->root; T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; - Primitives, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg, args->coll.connIndex << 16); + Primitives, 0, Proto, 0> + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, args->connIndex << 16); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -41,7 +41,7 @@ namespace { realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); realChunkSize = int(realChunkSize); @@ -70,7 +70,10 @@ namespace { } } #ifdef ENABLE_PROFILING - if (tid == 0 && args->coll.opCount) devProf->elems[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk); + if (tid == 0) { + struct ncclProfElem *elem = devProf.elems+args->opCount; + elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk); + } #endif } } diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index b484e0e3c7..084612f9a2 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -10,6 +10,7 @@ #include "collectives.h" #include "devcomm.h" +#include "op128.h" #define COLL_UNROLL 2 #define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree @@ -67,40 +68,8 @@ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum) -// [RCCL] Adding clique-based kernels for AllReduce, in-place of unused RingLL28 kernels -#define NCCL_FUNC5B(func, algo, devredop, type, nullify) \ - MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ - MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \ - MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) - -#define NCCL_FUNC4B(func, devredop, type, nullify) \ - NCCL_FUNC5B(func, TREE, devredop, type, nullify), \ - NCCL_FUNC5B(func, RING, devredop, type, nullify), \ - NCCL_FUNC5B(func, COLLNET, devredop, type, nullify) - -#define NCCL_FUNCS3C(func, devredop, nullForFloat) \ - NCCL_FUNC4B(func, devredop, int8_t, 0), \ - NCCL_FUNC4B(func, devredop, uint8_t, 0), \ - NCCL_FUNC4B(func, devredop, int32_t, 0), \ - NCCL_FUNC4B(func, devredop, uint32_t, 0), \ - NCCL_FUNC4B(func, devredop, int64_t, 0), \ - NCCL_FUNC4B(func, devredop, uint64_t, 0), \ - NCCL_FUNC4B(func, devredop, half, nullForFloat), \ - NCCL_FUNC4B(func, devredop, float, nullForFloat), \ - NCCL_FUNC4B(func, devredop, double, nullForFloat), \ - NCCL_FUNC4B(func, devredop, rccl_bfloat16, nullForFloat) - -#define NCCL_FUNCS2C(func) \ - NCCL_FUNCS3C(func, Sum, /*nullForFloat=*/0), \ - NCCL_FUNCS3C(func, Prod, /*nullForFloat=*/0), \ - NCCL_FUNCS3C(func, Max, /*nullForFloat=*/0), \ - NCCL_FUNCS3C(func, Min, /*nullForFloat=*/0), \ - NCCL_FUNCS3C(func, PreMulSum, /*nullForFloat=*/0), \ - NCCL_FUNCS3C(func, SumPostDiv, /*nullForFloat=*/1) - - // Must be consistent with the ncclFuncSet enum -using ncclKernelFunc_t = void (*)(struct ncclWorkElem* 
args); +using ncclKernelFunc_t = void (*)(); static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{ // Don't try to initialize the host shadow copy of this device-side global @@ -108,13 +77,13 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{ // confuses clang. This will be fixed in the next clang release. #if defined(__HIP_DEVICE_COMPILE__) #if defined(BUILD_ALLREDUCE_ONLY) - NCCL_FUNC4B(AllReduce, Sum, float, 0), + NCCL_FUNC4(AllReduce, Sum, float, 0), #else NCCL_FUNCS2B(Broadcast), NCCL_FUNCS2A(Reduce), NCCL_FUNCS2B(AllGather), NCCL_FUNCS2A(ReduceScatter), - NCCL_FUNCS2C(AllReduce), + NCCL_FUNCS2A(AllReduce), NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t), @@ -136,18 +105,18 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{ template struct Caller { static __device__ __host__ - void call(struct ncclWorkElem* const c) noexcept + void call(unsigned short funcIndex) noexcept { constexpr unsigned short m = f + (l - f) / 2; - return (c->funcIndex < m) ? Caller::call(c) : Caller::call(c); + return (funcIndex < m) ? 
Caller::call(funcIndex) : Caller::call(funcIndex); } }; template struct Caller{ static __device__ __host__ - void call(struct ncclWorkElem* const c) noexcept { ncclFuncs[f](c); } + void call(unsigned short funcIndex) noexcept { ncclFuncs[f](); } }; static_assert(FUNC_INDEX_P2P == 2710, "Wrong P2P function index"); @@ -155,86 +124,86 @@ static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 2711, "Wrong AllToAllPivot function i inline __device__ -void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept { +void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept { #if defined(BUILD_ALLREDUCE_ONLY) - if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE)) - ncclFunction_AllReduce_RING_SIMPLE_Sum_float(c); - else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL)) - ncclFunction_AllReduce_RING_LL_Sum_float(c); - else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL128)) - ncclFunction_AllReduce_RING_LL128_Sum_float(c); - else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE)) - ncclFunction_AllReduce_TREE_SIMPLE_Sum_float(c); - else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL)) - ncclFunction_AllReduce_TREE_LL_Sum_float(c); - else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE)) - ncclFunction_AllReduce_COLLNET_SIMPLE_Sum_float(c); - else if (c->funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_LL)) - ncclFunction_AllReduce_COLLNET_LL_Sum_float(c); + if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE)) + ncclFunction_AllReduce_RING_SIMPLE_Sum_float(); + else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, 
NCCL_PROTO_LL)) + ncclFunction_AllReduce_RING_LL_Sum_float(); + else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL128)) + ncclFunction_AllReduce_RING_LL_Sum_float(); + else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE)) + ncclFunction_AllReduce_TREE_SIMPLE_Sum_float(); + else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL)) + ncclFunction_AllReduce_TREE_LL_Sum_float(); + else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE)) + ncclFunction_AllReduce_COLLNET_SIMPLE_Sum_float(); + else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET, NCCL_PROTO_LL)) + ncclFunction_AllReduce_COLLNET_LL_Sum_float(); else assert("Unsupported function index"); #else - if (c->funcIndex < 540) { - if (c->funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(c); - else if (c->funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(c); - else if (c->funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(c); - else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(c); + if (funcIndex < 540) { + if (funcIndex % 9 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(); + else if (funcIndex % 9 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t(); + else if (funcIndex % 9 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t(); + else if (funcIndex % 9 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t(); + else if (funcIndex % 
9 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t(); + else if (funcIndex % 9 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t(); + else if (funcIndex % 9 == 6) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(); + else if (funcIndex % 9 == 7) ncclFunction_Broadcast_COLLNET_LL_Sum_int8_t(); + else ncclFunction_Broadcast_COLLNET_SIMPLE_Sum_int8_t(); } - else if (c->funcIndex < 1080) Caller<540, 1080>::call(c); - else if (c->funcIndex < 1620) { - if (c->funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(c); - else if (c->funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(c); - else if (c->funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c); - else if (c->funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(c); - else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c); + else if (funcIndex < 1080) Caller<540, 1080>::call(funcIndex); + else if (funcIndex < 1620) { + if (funcIndex % 9 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t(); + else if (funcIndex % 9 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t(); + else if (funcIndex % 9 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t(); + else if (funcIndex % 9 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t(); + else if (funcIndex % 9 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t(); + else if (funcIndex % 9 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t(); + else if (funcIndex % 9 == 6) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(); + else if (funcIndex % 9 == 7) ncclFunction_AllGather_COLLNET_LL_Sum_int8_t(); + else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(); } - else if (c->funcIndex < 2700) Caller<1620, 2700>::call(c); + else if 
(funcIndex < 2700) Caller<1620, 2700>::call(funcIndex); else { - switch (c->funcIndex - 2700) { + switch (funcIndex - 2700) { case 0: - ncclFunction_OneRankReduce_PreMulSum_int8_t(c); + ncclFunction_OneRankReduce_PreMulSum_int8_t(); break; case 1: - ncclFunction_OneRankReduce_PreMulSum_uint8_t(c); + ncclFunction_OneRankReduce_PreMulSum_uint8_t(); break; case 2: - ncclFunction_OneRankReduce_PreMulSum_int32_t(c); + ncclFunction_OneRankReduce_PreMulSum_int32_t(); break; case 3: - ncclFunction_OneRankReduce_PreMulSum_uint32_t(c); + ncclFunction_OneRankReduce_PreMulSum_uint32_t(); break; case 4: - ncclFunction_OneRankReduce_PreMulSum_int64_t(c); + ncclFunction_OneRankReduce_PreMulSum_int64_t(); break; case 5: - ncclFunction_OneRankReduce_PreMulSum_uint64_t(c); + ncclFunction_OneRankReduce_PreMulSum_uint64_t(); break; case 6: - ncclFunction_OneRankReduce_PreMulSum_half(c); + ncclFunction_OneRankReduce_PreMulSum_half(); break; case 7: - ncclFunction_OneRankReduce_PreMulSum_float(c); + ncclFunction_OneRankReduce_PreMulSum_float(); break; case 8: - ncclFunction_OneRankReduce_PreMulSum_double(c); + ncclFunction_OneRankReduce_PreMulSum_double(); break; case 9: - ncclFunction_OneRankReduce_PreMulSum_rccl_bfloat16(c); + ncclFunction_OneRankReduce_PreMulSum_rccl_bfloat16(); break; case 10: - ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c); + ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(); break; case 11: - ncclFunction_AllToAllPivot_RING_SIMPLE_Sum_int8_t(c); + ncclFunction_AllToAllPivot_RING_SIMPLE_Sum_int8_t(); default: break; } @@ -249,45 +218,49 @@ class ncclFunction { }; #ifdef ENABLE_COLLTRACE -#define traceColl(fIdx) \ - uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \ - shmem.comm.collTrace[pos].bid = bid; \ - shmem.comm.collTrace[pos].funcIndex = fIdx; \ - if (fIdx == FUNC_INDEX_P2P) { \ - shmem.comm.collTrace[pos].opCount = 
elems[0].p2p.opCount; \ - shmem.comm.collTrace[pos].p2p.nThreads = elems[0].p2p.nThreads; \ - shmem.comm.collTrace[pos].p2p.delta = (uint16_t)(elems[0].p2p.delta); \ +#define traceColl(elem,launch_type) \ + uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \ + ncclShmem->comm.collTrace[pos].bid = blockIdx.x; \ + ncclShmem->comm.collTrace[pos].funcIndex = ncclShmem->work.header.funcIndex; \ + asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (ncclShmem->comm.collTrace[pos].data_0)); \ + if (elem.header.type == ncclWorkTypeP2p) { \ + struct ncclWorkElemP2p *p2pElems = (struct ncclWorkElemP2p *)&elem; \ + ncclShmem->comm.collTrace[pos].p2p[0].connIndex = p2pElems[0].connIndex; \ + ncclShmem->comm.collTrace[pos].p2pOpCount[0] = p2pElems[0].opCount; \ + ncclShmem->comm.collTrace[pos].p2p[0].ngroups = p2pElems[0].ngroups; \ + ncclShmem->comm.collTrace[pos].p2p[0].nWarps = p2pElems[0].nWarps; \ + ncclShmem->comm.collTrace[pos].p2p[0].warpStart = p2pElems[0].warpStart; \ + ncclShmem->comm.collTrace[pos].p2p[0].peer = (uint16_t)(p2pElems[0].peer); \ + ncclShmem->comm.collTrace[pos].p2p[1].connIndex = p2pElems[1].connIndex; \ + ncclShmem->comm.collTrace[pos].p2pOpCount[1] = p2pElems[1].opCount; \ + ncclShmem->comm.collTrace[pos].p2p[1].ngroups = p2pElems[1].ngroups; \ + ncclShmem->comm.collTrace[pos].p2p[1].nWarps = p2pElems[1].nWarps; \ + ncclShmem->comm.collTrace[pos].p2p[1].warpStart = p2pElems[1].warpStart; \ + ncclShmem->comm.collTrace[pos].p2p[1].peer = (uint16_t)(p2pElems[1].peer); \ + ncclShmem->comm.collTrace[pos].type = (ncclCollTraceP2pElemType|launch_type); \ } else { \ - shmem.comm.collTrace[pos].opCount = elems[0].coll.opCount; \ - shmem.comm.collTrace[pos].coll.nThreads = elems[0].nThreads; \ - shmem.comm.collTrace[pos].coll.bid = elems[0].coll.bid; \ - shmem.comm.collTrace[pos].coll.nChannels = 
elems[0].coll.nChannels; \ + ncclShmem->comm.collTrace[pos].opCount = elem.opCount; \ + ncclShmem->comm.collTrace[pos].coll.nWarps = elem.header.nWarps; \ + ncclShmem->comm.collTrace[pos].coll.bid = elem.bid; \ + ncclShmem->comm.collTrace[pos].coll.nChannels = elem.nChannels; \ + ncclShmem->comm.collTrace[pos].type = (ncclCollTraceCollElemType|launch_type); \ } -#define traceKernelLaunch(fIdx) { \ - if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \ - traceColl(fIdx); \ - asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (shmem.comm.collTrace[pos].data_0)); \ - shmem.comm.collTrace[pos].type = ncclCollTraceKernelLaunchType; \ - } \ + +#define traceKernelLaunch(elem,firstLaunch) { \ + traceColl(elem,(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType)); \ } -#define traceCollEnd(fIdx) { \ - if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \ - traceColl(fIdx); \ - shmem.comm.collTrace[pos].type = ncclCollTraceCollEndType; \ - } \ +#define traceKernelEnd() { \ + uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \ + ncclShmem->comm.collTrace[pos].bid = bid; \ + ncclShmem->comm.collTrace[pos].type = ncclCollTraceKernelEndType; \ } -#define traceKernelEnd(fIdx) { \ - if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \ - traceColl(fIdx); \ - shmem.comm.collTrace[pos].type = ncclCollTraceKernelEndType; \ - } \ - } -#define traceAbort(fIdx) { \ - if (!(fIdx == FUNC_INDEX_P2P && elems[0].p2p.nThreads == 0)) { \ - traceColl(fIdx); \ - shmem.comm.collTrace[pos].type = ncclCollTraceAbortType; \ - } \ +#define traceAbort() { \ + uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \ + ncclShmem->comm.collTrace[pos].bid = bid; \ + 
ncclShmem->comm.collTrace[pos].type = ncclCollTraceAbortType; \ } // traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1) #define traceData(data2, data4, data8_0, data8_1) { \ @@ -301,9 +274,8 @@ class ncclFunction { ncclShmem->comm.collTrace[pos].type = ncclCollTraceDataType; \ } #else -#define traceKernelLaunch(fIdx) -#define traceCollEnd(fIdx) -#define traceAbort(fIdx) +#define traceKernelLaunch() +#define traceAbort() #define traceData(data2, data4, data8_0, data8_1) #endif @@ -313,6 +285,28 @@ __device__ inline bool barrierReduceAny(int bit, uint32_t* abortCount) { return atomicAdd(abortCount, 0) != 0; } +// Copy src to dst and fill extra size with zeroes +template +__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) { + static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0, + "copyToShmem needs sizes which are multiple of 16B"); + static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small"); + static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle"); + uint64_t *d = reinterpret_cast(dst); + uint64_t const *s = reinterpret_cast(src); + uint64_t *shmemPtr = d; + int offset = 2*tid; + uint64_t v0, v1; + if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) { + v0 = v1 = 0ULL; + } else { + v0 = s[offset] ; v1 = s[offset+1]; + } + if (offset < sizeof(Tdst)/sizeof(uint64_t)) { + shmemPtr[offset] = v0; shmemPtr[offset+1] = v1; + } +} + template __device__ int copyToShmem(T *dst, T const *src, int turn=0) { static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh"); @@ -352,47 +346,21 @@ struct RunWorkElement { } }; -#if CUDART_VERSION >= 11030 -__device__ constexpr int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] = -#else -static __device__ __constant__ int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] = -#endif -{/*Tree*/1, /*Ring and P2P*/1, /*CollNet*/NCCL_REG_ELEM_FACTOR}; - template struct RunWork { // 
This __forceinline__ is necessary. The compiler was inserting a function call // here from the LL ncclKernel. __device__ __forceinline__ void run(ncclWork *w) { - int tid = threadIdx.x; - /* Some invariants that must hold: - * 1. All elems[] have same funcIndex. - * 2. All elems[] have same nThreads. - * 3. The thread-to-group relation (as in prims group numbers) is the same - * for all elems[]. - * - * If (1) isn't true then we might be in the wrong function since dispatch - * on ncclFuncs[w->funcIndex] is how we got here. - * - * If (2) or (3) aren't true, then threads from different work elements - * could race for barrier resources (barrier numbers 0...15) which is fatal. - * - * IMPORTANT!!! To ensure (3), implementations of - * `RunWorkElement::run()` may only use the following - * when deciding how to map threads to groups: - * Fn, T, RedOp, Algo, Proto, nThreads - * - * This last one is difficult to enforce so I hope everyone reads this. - */ - if (tid < w->elems[0].nThreads) { - #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e+=ncclWorkElemFactors[Algo]) + int wid = threadIdx.x / WARP_SIZE; + int inc = w->header.type == ncclWorkTypeRegColl ? 
sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1; + #pragma unroll 1 + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) { + if (wid < w->header.nWarps) RunWorkElement().run(&w->elems[e]); } } }; -#define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) struct ncclShmemGroup { ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY]; ncclConnInfo *sendConns[NCCL_MAX_DIRECT_ARITY]; @@ -400,7 +368,7 @@ struct ncclShmemGroup { void* dsts[NCCL_MAX_DIRECT_ARITY+1]; int totalSendSize[NCCL_MAX_SLICE_PER_CHUNK]; uint64_t barrier; - uint64_t barrier_next[MAXWARPS]; + uint64_t barrier_next[NCCL_MAX_GROUPS]; }; struct ncclShmemData { @@ -408,18 +376,41 @@ struct ncclShmemData { uint64_t ll128warp[NCCL_MAX_GROUPS][NCCL_MAX_GROUPS]; struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; }; - uint32_t sync[MAXWARPS]; + uint32_t sync[NCCL_MAX_GROUPS]; uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1]; - ncclDevComm comm; - ncclChannel channel; - ncclWork work; + struct ncclDevComm comm; + struct ncclChannel channel; + uint64_t pad; + struct ncclWork work; }; +static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { + if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) { + /* redOpArg is a pointer to the scalar value, so we'll dereference it + * here so that redOpArg holds the bits of the scalar going forward. + * The tricky thing is we don't know its type T since that's encoded in + * the funcIndex. Because it would be difficult to get sizeof(T) from + * funcIndex, we'll cheat and just dereference the largest possible size + * given the alignment of the pointer. We might be reading in more bytes + * than we need but that's harmless. 
+ */ + if (we->redOpArg%2 != 0) + we->redOpArg = *reinterpret_cast(we->redOpArg); + else if (we->redOpArg%4 != 0) + we->redOpArg = *reinterpret_cast(we->redOpArg); + else if (we->redOpArg%8 != 0) + we->redOpArg = *reinterpret_cast(we->redOpArg); + else + we->redOpArg = *reinterpret_cast(we->redOpArg); + } +} + extern __device__ struct ncclShmemData *ncclShmem; template -__device__ void ncclKernel(ncclWorkElem first) { +__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { int tid = threadIdx.x; + int nthreads = blockDim.x; int bid = blockIdx.x; __shared__ struct ncclShmemData shmem; ncclShmem = &shmem; @@ -428,110 +419,90 @@ __device__ void ncclKernel(ncclWorkElem first) { abortCount = 0; for (auto i = 0; i < NCCL_MAX_GROUPS; i++) { shmem.groups[i].barrier = 0; - for (auto j = 0; j < MAXWARPS; j++) shmem.groups[i].barrier_next[j] = 0; + for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0; } } __syncthreads(); - int turn = copyToShmem(&shmem.comm, first.comm); + int turn = copyToShmem(&ncclShmem->comm, comm); // get address of channel without incurring indirect load from ncclDevCom::channels - ncclChannel *channel = &((ncclDevCommAndChannels*)first.comm)->channels[bid]; - turn = copyToShmem(&shmem.channel, channel, turn); + ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid]; + turn = copyToShmem(&ncclShmem->channel, channel, turn); // To optimize for latency, (only) the first operation is passed as argument. 
- if (bid == 0 && first.active != 0) { - turn = copyToShmem(&shmem.work.elems[0], &first, turn); - if (1 <= tid && tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) { - shmem.work.elems[tid].active = 0; - shmem.work.elems[tid].redOpArgIsPtr = 0; - } + if (bid == 0 && first.header.type != ncclWorkTypeUnused) { + // Copy first elem to work and zero out the rest + copyToShmem(&ncclShmem->work, &first, tid, nthreads); } - struct ncclWorkElem* elems = shmem.work.elems; - __syncthreads(); // publish shmem + __syncthreads(); // publish ncclShmem - ncclWork *workFifoHost = shmem.channel.workFifo; - ncclWork *workFifoDev = shmem.channel.workFifoDev; - int workFifoIx = shmem.channel.index; + ncclWork *workFifoHost = ncclShmem->channel.workFifo; + ncclWork *workFifoDev = ncclShmem->channel.workFifoDev; + int workFifoIx = ncclShmem->channel.index; bool skipLoadWork = false, firstLaunch = true; - if (bid == 0 && first.active != 0) + if (bid == 0 && first.header.type != ncclWorkTypeUnused) skipLoadWork = true; while (true) { if (!skipLoadWork) { - copyToShmem(&shmem.work, &workFifoDev[workFifoIx]); // turn no longer helps - // Check whether the last operation was aborted and make sure all threads exit - int aborted = tid == 0 ? *shmem.comm.abortFlag : 0; - if (barrierReduceAny(aborted, &abortCount)) { // publish shmem.work - if (COLLTRACE && tid == 0) traceAbort(elems->funcIndex); - break; + copyToShmem(&ncclShmem->work, &workFifoDev[workFifoIx], tid, nthreads); + { // Check whether the last operation was aborted and make sure all threads exit + int aborted = tid == 0 ? 
*comm->abortFlag : 0; + if (barrierReduceAny(aborted, &abortCount)) { // publish ncclShmem->work + if (COLLTRACE && tid == 0) traceAbort(); + break; + } + if (tid == 0) + workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused; } - if (tid == 0) - workFifoHost[workFifoIx].elems[0].active = 0; - if (COLLTRACE && tid == 0) { - if (firstLaunch) traceKernelLaunch(elems->funcIndex); - if (!firstLaunch) traceCollEnd(elems->funcIndex); - firstLaunch = false; - } - } else if (COLLTRACE && tid == 0) { - traceKernelLaunch(elems->funcIndex); - firstLaunch = false; } workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS; if (tid == 0) channel->index = workFifoIx; // write back to real channel, not shmem shadow - if (tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) { - ncclWorkElem *we = &shmem.work.elems[tid]; - if (we->redOpArgIsPtr && we->active != 0) { - /* redOpArg is a pointer to the scalar value, so we'll dereference it - * here so that redOpArg holds the bits of the scalar going forward. - * The tricky thing is we don't know its type T since that's encoded in - * the funcIndex. Because it would be difficult to get sizeof(T) from - * funcIndex, we'll cheat and just dereference the largest possible size - * given the alignment of the pointer. We might be reading in more bytes - * than we need but that's harmless. 
- */ - if (we->coll.redOpArg%2 != 0) - we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); - else if (we->coll.redOpArg%4 != 0) - we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); - else if (we->coll.redOpArg%8 != 0) - we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); - else - we->coll.redOpArg = *reinterpret_cast(we->coll.redOpArg); - } + __syncwarp(); + if (ncclShmem->work.header.type == ncclWorkTypeColl) { + if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem->work.elems[tid]); + } else if (ncclShmem->work.header.type == ncclWorkTypeRegColl) { + if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem->work.regElems[tid].elem); } __syncthreads(); - if (shmem.work.elems[0].funcIndex == FnIndex) - RunWork().run(&shmem.work); - else - NCCL_CALL_FUNCTIONS(&elems[0]); - - if (shmem.work.elems[0].active == 2) { - if (COLLTRACE && tid == 0) traceKernelEnd(elems->funcIndex) - break; + if (COLLTRACE && tid == 0) { + traceKernelLaunch(ncclShmem->work.elems[0],firstLaunch); + firstLaunch = false; + #pragma unroll 1 + for(int e=1; e < NCCL_MAX_WORK_ELEMENTS && ncclShmem->work.elems[e].header.type != ncclWorkTypeUnused; e ++) { + traceColl(ncclShmem->work.elems[e], 0); + } } + if (ncclShmem->work.header.funcIndex == FnIndex) + RunWork().run(&ncclShmem->work); + else + NCCL_CALL_FUNCTIONS(ncclShmem->work.header.funcIndex); + + if (ncclShmem->work.header.isLast) break; __syncthreads(); skipLoadWork = false; } + if (COLLTRACE && tid == 0) traceKernelEnd() } #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ __launch_bounds__(NCCL_MAX_NTHREADS, 1) \ -__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem first) { \ - if (first.comm->collTraceThread) \ - ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(first); \ +__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, ncclWorkElem first) { \ + if (comm->collTraceThread) \ + ncclKernel, 
NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(comm, first); \ else \ - ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(first); \ + ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(comm, first); \ } // Examples : AllReduce, RING, LL, Sum, uint8 /* Functions for aggregation case */ #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ -__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(struct ncclWorkElem* args) { \ +__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem->work); \ } @@ -574,46 +545,6 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev IMPL_COLL2(func, PreMulSum) \ IMPL_COLL2A(func, SumPostDiv) -// [RCCL] Define clique-based implementations (repurposed LL128) -#define IMPL_COLL4_CLIQUE(func, algo, devredop, type, ncclType) \ - IMPL_COLL_FUNC(func, algo, LL, devredop, type) \ - IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \ - IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \ - -#define IMPL_COLL3_CLIQUE(func, devredop, type, ncclType) \ - IMPL_COLL4_CLIQUE(func, TREE, devredop, type, ncclType) \ - IMPL_COLL4_CLIQUE(func, RING, devredop, type, ncclType) \ - IMPL_COLL4_CLIQUE(func, COLLNET, devredop, type, ncclType) - -#define IMPL_COLL2_CLIQUE(func, devredop) \ - IMPL_COLL3_CLIQUE(func, devredop, int8_t, ncclInt8) \ - IMPL_COLL3_CLIQUE(func, devredop, uint8_t, ncclUint8) \ - IMPL_COLL3_CLIQUE(func, devredop, int32_t, ncclInt32) \ - IMPL_COLL3_CLIQUE(func, devredop, uint32_t, ncclUint32) \ - IMPL_COLL3_CLIQUE(func, devredop, int64_t, ncclInt64) \ - IMPL_COLL3_CLIQUE(func, devredop, uint64_t, ncclUint64) \ - IMPL_COLL3_CLIQUE(func, devredop, half, ncclFloat16) \ - IMPL_COLL3_CLIQUE(func, devredop, float, ncclFloat32) \ - IMPL_COLL3_CLIQUE(func, devredop, double, ncclFloat64) \ - IMPL_COLL3_CLIQUE(func, devredop, rccl_bfloat16, 
ncclBfloat16) - -#define IMPL_COLL2A_CLIQUE(func, devredop) \ - IMPL_COLL3_CLIQUE(func, devredop, int8_t, ncclInt8) \ - IMPL_COLL3_CLIQUE(func, devredop, uint8_t, ncclUint8) \ - IMPL_COLL3_CLIQUE(func, devredop, int32_t, ncclInt32) \ - IMPL_COLL3_CLIQUE(func, devredop, uint32_t, ncclUint32) \ - IMPL_COLL3_CLIQUE(func, devredop, int64_t, ncclInt64) \ - IMPL_COLL3_CLIQUE(func, devredop, uint64_t, ncclUint64) - -#define IMPL_COLL_CLIQUE(func) \ - IMPL_COLL2_CLIQUE(func, Sum) \ - IMPL_COLL2_CLIQUE(func, Prod) \ - IMPL_COLL2_CLIQUE(func, Min) \ - IMPL_COLL2_CLIQUE(func, Max) \ - IMPL_COLL2_CLIQUE(func, PreMulSum) \ - IMPL_COLL2A_CLIQUE(func, SumPostDiv) -// [/RCCL] - // Copy primitives only define one function for copy #define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8); diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 0349249e1a..d6fa08f186 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -17,9 +17,15 @@ // Define min for ssize_t static __device__ int min(int a, ssize_t b) { return (a < b) ? 
a : b; } -template -inline __device__ void loadPtr(void** ptr, T* &v) { +inline __device__ int loadInt(int* ptr) { + int v; +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) v = LOAD(ptr); +#else + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(v) : "l"(ptr)); +#endif + return v; } typedef uint64_t PackType; @@ -485,16 +491,16 @@ struct MULTI128 { inline __device__ void Fetch128(Pack128& v, const Pack128* p) { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - v.x = p->x; - v.y = p->y; + v.x = __builtin_nontemporal_load(&p->x); + v.y = __builtin_nontemporal_load(&p->y); #else asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory"); #endif } inline __device__ void Store128(Pack128* p, Pack128& v) { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - p->x = v.x; - p->y = v.y; + __builtin_nontemporal_store(v.x, &p->x); + __builtin_nontemporal_store(v.y, &p->y); #else asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory"); #endif diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu index 54fd993ea3..af1d56fc26 100644 --- a/src/collectives/device/onerank_reduce.cu +++ b/src/collectives/device/onerank_reduce.cu @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -17,11 +17,11 @@ namespace { int tid = threadIdx.x; int tn = blockDim.x; #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e++) { + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) { ncclWorkElem *we = &w->elems[e]; - intptr_t eltN = we->coll.count; - int bid = we->coll.bid; - int bn = we->coll.nChannels; + intptr_t eltN = we->count; + int bid = we->bid; + int bn = we->nChannels; T const *src = (T const*)we->sendbuff; T *dst = (T*)we->recvbuff; @@ -37,13 +37,13 @@ namespace { src += i0; dst += i0; ReduceOrCopyMulti - (tid, tn, &(we->coll.redOpArg), true, 1, &src, 1, &dst, i1-i0); + (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0); } } } #define INSTANTIATE(devredop, type) \ - __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)(struct ncclWorkElem* args) { \ + __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ oneRankReduce>(); \ } diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index 627407ddd0..ee8f06a569 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -125,7 +125,7 @@ struct FanSymmetric { }; // The primitives class. Specialized per protocol in the other headers. 
-template +template class Primitives; // Used by LL & LL128 to implement direct members in the naive way. @@ -157,20 +157,12 @@ struct PrimitivesWithoutDirect { #include "prims_ll128.h" #ifdef ENABLE_PROFILING -#ifdef ENABLE_TIMING_PROFILE #define INIT_COUNTER \ - if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); } + if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; t0 = __builtin_amdgcn_s_memrealtime(); ws = elem->elem[blockIdx.x].wait_cycle; } #define ACCUMULATE_COUNTER(prim) \ - if (tid == 0 && args->coll.opCount) { devProf->elems[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0); \ - devProf->elems[blockIdx.x].prim##_byte += nelem * sizeof(T); } -#else -#define INIT_COUNTER \ - if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = devProf->elems[blockIdx.x].wait_cycle; } -#define ACCUMULATE_COUNTER(prim) \ - if (tid == 0 && args->coll.opCount) { devProf->elems[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \ - + ws - devProf->elems[blockIdx.x].wait_cycle); \ - devProf->elems[blockIdx.x].prim##_byte += nelem * sizeof(T); } -#endif + if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \ + + ws - elem->elem[blockIdx.x].wait_cycle); \ + elem->elem[blockIdx.x].prim##_byte += nelem * sizeof(T); elem->elem[blockIdx.x].opCount = args->opCount;} #else #define INIT_COUNTER #define ACCUMULATE_COUNTER(prim) diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index d6e76d6985..7e0ca7211b 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -1,13 +1,13 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -template -class Primitives: - public PrimitivesWithoutDirect> { +template +class Primitives: + public PrimitivesWithoutDirect> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -45,7 +45,7 @@ class Primitives: #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) __syncthreads(); #else - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group)); + asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); #endif } @@ -123,7 +123,7 @@ class Primitives: template __device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) { - #pragma unroll 1 + #pragma unroll for (int i=BeginIx; i < MaxRecv; i++) { if (i < fan.nrecv()) { union ncclLLFifoLine* src = recvPtr(i) + offset; @@ -290,14 +290,7 @@ class Primitives: // Always waitSend in case of cleanup nelem = nelem < 0 ? 0 : nelem; -#ifdef ENABLE_PROFILING - uint64_t t0; - if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime(); -#endif if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine)); -#ifdef ENABLE_PROFILING - if (SEND && tid == 0) ncclShmem->comm.devProf->elems[blockIdx.x].wait_cycle = (__builtin_amdgcn_s_memrealtime() - t0); -#endif nelem -= tid*EltPerLine; srcElts += tid*EltPerLine; @@ -324,7 +317,7 @@ class Primitives: } if (RECV) { data = !SRC ? 
peerData : MULTI()(redOp, peerData, data); - #pragma unroll 1 + #pragma unroll MaxRecv for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) { peerData = readLLFinish(offset, line, i); data = MULTI()(redOp, peerData, data); diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 4dc0c21754..972ce9d091 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,9 +11,9 @@ #define __any_sync(WARP_MASK, needReload) (true) -template -class Primitives: - public PrimitivesWithoutDirect> { +template +class Primitives: + public PrimitivesWithoutDirect> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -52,7 +52,11 @@ class Primitives: inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } inline __device__ void barrier() { - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group)); +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + __syncthreads(); +#else + asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); +#endif } uint32_t abort = 0; diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index 7160cc44ff..a107a1ce64 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -1,14 +1,14 @@ /************************************************************************* - * 
Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ template + int SlicePerChunk, int StepPerSlice, int Unroll, int P2p> class Primitives< - T, RedOp, Fan, Direct, ProtoSimple + T, RedOp, Fan, Direct, ProtoSimple, P2p > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -19,7 +19,7 @@ class Primitives< RolePostSend = 0x10, RolePostRecv = 0x20, Aborted = 0x40, - PtrsFifoEnabled = 0x80, + OffsFifoEnabled = 0x80, SizesFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, @@ -33,10 +33,10 @@ class Primitives< int flags; int group; uint64_t step; + int *connOffsFifoPtr; // (flags & OffsFifoEnabled) union { - void **connPtrsFifoPtr; // (flags & PtrsFifoEnabled) T *userBuff; // (flags & (RoleInput|RoleOutput)) - T *connEltsFifo; // !(flags & (PtrsFifoEnabled|RoleInput|RoleOutput)) + T *connEltsFifo; // !(flags & (RoleInput|RoleOutput)) }; union { int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled) @@ -59,7 +59,7 @@ class Primitives< if (nthreads == WARP_SIZE) __syncwarp(); else - asm volatile("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads)); + asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads)); #endif flags |= ThreadsSynced; } @@ -70,7 +70,7 @@ class Primitives< if (nworkers == nthreads) barrier(); else - asm volatile("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers)); + asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers)); #endif } @@ -88,9 +88,6 @@ class Primitives< const bool isSendNotRecv = (Send && Recv) ? 
(flags & RoleWaitSend) : Send; const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write -#if defined(ENABLE_PROFILING) && !defined(ENABLE_TIMING_PROFILE) - uint64_t t0 = __builtin_amdgcn_s_memrealtime(); -#endif if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || ((flags & (Send*RoleWaitSend)) && !noSendWait)) { int spins = 0; @@ -109,8 +106,8 @@ class Primitives< void **ptrs = isSendNotRecv ? (ncclShmem->groups[group].dsts + Dst) : (ncclShmem->groups[group].srcs + Src); - if (flags & PtrsFifoEnabled) - loadPtr(connPtrsFifoPtr + step%NCCL_STEPS, ptrs[index]); + if (flags & OffsFifoEnabled) + ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T); else if (isSendNotRecv && DirectSend) { if (flags & DirectWrite) { ptrs[index] = directBuff + remoteIx + offset; @@ -132,14 +129,6 @@ class Primitives< ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } step += StepPerSlice; -#if defined(ENABLE_PROFILING) && !defined(ENABLE_TIMING_PROFILE) - if (opCount) { - if (isSendNotRecv) - ncclShmem->comm.devProf->elems[blockIdx.x].wait_send_cycle += (__builtin_amdgcn_s_memrealtime() - t0); - else - ncclShmem->comm.devProf->elems[blockIdx.x].wait_recv_cycle += (__builtin_amdgcn_s_memrealtime() - t0); - } -#endif } } @@ -204,7 +193,10 @@ class Primitives< waitPeer(dstIx, remoteIx, offset, sliceSize); subBarrier(); #ifdef ENABLE_PROFILING - if (tid == 0 && opCount) ncclShmem->comm.devProf->elems[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0); + if (tid == 0) { + struct ncclProfElem *elem = ncclShmem->comm.devProf.elems+opCount%PROFILE_NUM_ITEMS; + elem->elem[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0); + } #endif if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) { // We 
can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy @@ -262,6 +254,8 @@ class Primitives< } // Scatter/Gather generic op + // skip: my own rank order in the buffer chunks + // shift: peer offset to avoid all ranks sending to or receiving from same peer template __device__ __forceinline__ void ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) { @@ -287,11 +281,13 @@ class Primitives< for (int j=0; j= 0 && i >= skip) peerOffset += peerElem; const T* src0 = (T*)ncclShmem->groups[group].srcs[0] + peerOffset; int realPeerSize = min(realSize, totalElem-peerOffset); if (realPeerSize > 0 && ncclShmem->groups[group].dsts[i] != nullptr) { ReduceOrCopyMulti(tid, nworkers, ncclShmem->redOpArgs, false, 1, &src0, 1, (T**)ncclShmem->groups[group].dsts+i, realPeerSize); + // Mark for threadfence at the end if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] += realPeerSize; } } @@ -319,6 +315,7 @@ class Primitives< } } barrier(); + // If we indeed send something, threadfence if (Send && (flags & RolePostSend) && ncclShmem->groups[group].totalSendSize[slice] > 0 && index == 0) __threadfence_system(); __syncwarp(); @@ -340,18 +337,18 @@ class Primitives< ncclShmem->groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() connStepPtr = conn->tail; connStepCache = LOAD(connStepPtr); - flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0; + flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0; if (Direct) { // User buffers have been registered if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? 
DirectRead : 0; } } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case @@ -360,10 +357,9 @@ class Primitives< } } } - if (flags & PtrsFifoEnabled) - connPtrsFifoPtr = conn->ptrsFifo; - else - connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; + if (flags & OffsFifoEnabled) + connOffsFifoPtr = conn->offsFifo; + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; } } } @@ -380,11 +376,10 @@ class Primitives< ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() connStepPtr = conn->head; connStepCache = LOAD(connStepPtr); - flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0; - if (flags & PtrsFifoEnabled) - connPtrsFifoPtr = conn->ptrsFifo; - else - connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; + flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0; + if (flags & OffsFifoEnabled) + connOffsFifoPtr = conn->offsFifo; + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; if (conn->sizesFifo != nullptr) { flags |= SizesFifoEnabled; @@ -392,14 +387,14 @@ class Primitives< } else if (Direct) { // User buffers have been registered if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? 
DirectRead : 0; } } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case @@ -419,7 +414,7 @@ class Primitives< ): tid(tid), stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)), - opCount(ncclShmem->work.elems[0].coll.opCount) { + opCount(ncclShmem->work.elems[0].opCount) { // For send operations, we need an extra warp to overlap the threadfence and the copy this->nthreads = nthreads; @@ -460,7 +455,7 @@ class Primitives< loadRecvConn(&ncclShmem->channel.devPeers[peer], connIndex, e); loadSendConn(&ncclShmem->channel.devPeers[peer], connIndex, e); - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkRegElem*)e); + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } __device__ ~Primitives() { @@ -477,7 +472,7 @@ class Primitives< barrier(); } - __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkRegElem* e) { + __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) { if (flags & RoleInput) { userBuff = (T*)inputBuf; ncclShmem->redOpArgs[0] = redOpArg; // scaler for local input diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 0bf2a98f2a..54355926a5 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
 * * See LICENSE.txt for license information ************************************************************************/ @@ -13,21 +13,21 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem->channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const int nranks = ncclShmem->comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; - const ssize_t size = args->coll.count; + const ssize_t size = args->count; const int rank = ncclShmem->comm.rank; const int prevRank = ring->devUserRanks[nranks-1]; - const int root = args->coll.root; + const int root = args->root; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg, args->coll.connIndex << 16); + Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16); auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { int realChunkSize; @@ -36,7 +36,7 @@ namespace { realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); return realChunkSize; diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index a5ee6aefa5..9639372b79 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -13,9 +13,9 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem->channel.ring; int const *ringRanks = ring->devUserRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
REDUCESCATTER_CHUNKSTEPS : 1)); @@ -23,10 +23,10 @@ namespace { const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); const int nranks = ncclShmem->comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; - const ssize_t size = args->coll.count; + const ssize_t size = args->count; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg, args->coll.connIndex << 16); + Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -35,7 +35,7 @@ namespace { realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); realChunkSize = int(realChunkSize); diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index 698bccde53..15be552009 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
 * * See LICENSE.txt for license information ************************************************************************/ @@ -11,73 +11,67 @@ template<typename T, typename RedOp> struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> { - __device__ __attribute__((noinline)) void run(ncclWork *work) { - int tid = threadIdx.x; - int group = 0; - const int rank = ncclShmem->comm.rank; - const int nRanks = ncclShmem->comm.nRanks; - using Proto = ProtoSimple<1, 1>; - - for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) { - ncclWorkElem *args = &work->elems[s]; - int nThreadsSegment = args->p2p.nThreads; - if (args->active == 0 || nThreadsSegment == 0) break; - - int nThreadsSplit = nThreadsSegment/2; - int groupRecv = group; - group += Proto::calcGroupWidth(/*send=*/false, nThreadsSplit); - int groupSend = group; - group += Proto::calcGroupWidth(/*send=*/true, nThreadsSegment - nThreadsSplit); - - if (tid < nThreadsSegment) { - // Compute pointers - T const* sendbuff = (const T*)args->sendbuff; - T* recvbuff = (T*)args->recvbuff; - ssize_t const sendCount = args->p2p.sendCount; - ssize_t const recvCount = args->p2p.recvCount; - int const delta = args->p2p.delta; - - if (delta == 0) { - if (sendbuff != recvbuff) { - ReduceOrCopyMulti(tid, nThreadsSegment, nullptr, false, 1, &sendbuff, 1, &recvbuff, sendCount); - } - } - else { - if ((tid < nThreadsSplit) && recvCount >= 0) { - int const peer = (rank - delta + nRanks)%nRanks; - int const t0 = 0; - int const nt = nThreadsSplit; - int const chunkSize = args->p2p.recvChunkSize/sizeof(T); - Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto> prims - (tid-t0, nt, &peer, nullptr, nullptr, recvbuff, /*redOpArg(ignored)=*/0, groupRecv | (args->p2p.recvIdx << 16)); - ssize_t offset = 0; - do { - int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T))); - nelem = min(chunkSize, recvCount-offset); - prims.directRecv(offset, nelem); - offset += nelem; - } while(offset < recvCount); - } - - if ((tid >= nThreadsSplit) && sendCount >= 0) { - int const peer = (rank + delta)%nRanks; - int const t0 = nThreadsSplit; - int const nt = nThreadsSegment - nThreadsSplit; - int const chunkSize = 
args->p2p.sendChunkSize/sizeof(T); - Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto> prims - (tid-t0, nt, nullptr, &peer, sendbuff, nullptr, /*redOpArg(ignored)=*/0, groupSend | (args->p2p.sendIdx << 16)); - ssize_t offset = 0; - do { - int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T))); - nelem = min(chunkSize, sendCount-offset); - prims.directSend(offset, offset, nelem); - offset += nelem; - } while(offset < sendCount); - } - } - break; + __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + if (args->peer == ncclShmem->comm.rank) { + struct ncclWorkElemP2p* recvArgs = args-1; + if (args->buff != recvArgs->buff) { + ReduceOrCopyMulti(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count); } - tid -= nThreadsSegment; + } else { + using Proto = ProtoSimple<1, 1>; + ssize_t const count = args->count; + int const chunkSize = args->chunkSize/sizeof(T); + int const peer = args->peer; + Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims + (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group); + ssize_t offset = 0; + do { + int nelem = min(chunkSize, count-offset); + prims.directSend(offset, offset, nelem); + offset += nelem; + } while(offset < count); + } + } + + __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + if (args->peer != ncclShmem->comm.rank) { + using Proto = ProtoSimple<1, 1>; + ssize_t const count = args->count; + int const chunkSize = args->chunkSize/sizeof(T); + int const peer = args->peer; + Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims + (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group); + ssize_t offset = 0; + do { + int nelem = min(chunkSize, count-offset); + prims.directRecv(offset, nelem); + offset += nelem; + } while(offset < count); + } + } + + __device__ __forceinline__ void run(ncclWork *work) { + struct ncclWorkElemP2p* args = work->p2pElems; 
+ int ngroups = args->ngroups; + int tid = threadIdx.x; + int wid = tid / WARP_SIZE; + // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3 + // warps for send, 2 warps for recv). + // warpStarts were rounded thanks to int division, but for group number we need to round the other way around + // So we mirror wid then mirror again the group. + #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) + int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; + args += group; + if (args->header.type == ncclWorkTypeUnused) return; + + tid -= args->warpStart * WARP_SIZE; + int nthreads = args->nWarps * WARP_SIZE; + group |= (args->connIndex<<16); // Used to select connIndex 1 + if (tid >= nthreads || args->peer == -1) return; + if ((group%2) == 0) { + runRecv(tid, nthreads, group, args); + } else { + runSend(tid, nthreads, group, args); } } }; diff --git a/src/collectives/sendrecv_api.cc b/src/collectives/sendrecv_api.cc index 296144dff1..b137d22526 100644 --- a/src/collectives/sendrecv_api.cc +++ b/src/collectives/sendrecv_api.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -14,8 +14,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream) { NVTX3_FUNC_RANGE_IN(nccl_domain); - struct ncclInfo info = { ncclFuncSendRecv, "Send", - sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */ + struct ncclInfo info = { ncclFuncSend, "Send", + NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; NCCLCHECK(ncclGroupStart()); @@ -29,7 +29,7 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream) { NVTX3_FUNC_RANGE_IN(nccl_domain); - struct ncclInfo info = { ncclFuncSendRecv, "Recv", + struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; diff --git a/src/debug.cc b/src/debug.cc index 321d8081e3..fbf51524e0 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -168,3 +168,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } pthread_mutex_unlock(&ncclDebugLock); } + +NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); + +void ncclSetThreadName(pthread_t thread, const char *fmt, ...) { + // pthread_setname_np is nonstandard GNU extension + // needs the following feature test macro +#ifdef _GNU_SOURCE + if (ncclParamSetThreadName() != 1) return; + char threadName[NCCL_THREAD_NAMELEN]; + va_list vargs; + va_start(vargs, fmt); + vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs); + va_end(vargs); + pthread_setname_np(thread, threadName); +#endif +} diff --git a/src/enhcompat.cc b/src/enhcompat.cc new file mode 100644 index 0000000000..97f5a3fb26 --- /dev/null +++ b/src/enhcompat.cc @@ -0,0 +1,28 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +/* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ + +enum cudaError_t { cudaErrorStubLibrary = 34 }; + +extern "C" { + +cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaStreamUpdateCaptureDependencies(...) 
__attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; } + +} diff --git a/src/enqueue.cc b/src/enqueue.cc index 100e05a52e..3b4ccb6ecf 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -68,7 +68,7 @@ NCCL_FUNCS3B(func, Sum), /*PreMulSum*/ \ NCCL_FUNCS3B(func, Sum) /*SumPostDiv*/ -typedef void(*ncclKern_t)(struct ncclWorkElem first); +typedef void(*ncclKern_t)(struct ncclDevComm* comm, struct ncclWorkElem first); // Must be consistent with the ncclFuncSet enum static ncclKern_t const ncclKerns[1] = { NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), @@ -89,6 +89,19 @@ error: return (res != ncclSuccess) ? 
0 : max; } +// Set shared memory carveout for the nccl kernels +ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut) { + ncclResult_t res = ncclSuccess; + int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]); + for (int i = 0; i < numNcclKerns; i++) { + CUDACHECKGOTO(hipFuncSetAttribute((const void *)ncclKerns[i], hipFuncAttributePreferredSharedMemoryCarveout, carveOut), res, error); + } + +error: + return res; +} + + /*****************************************************************************/ /* Launch system : synchronization and CUDA kernel launch */ /*****************************************************************************/ @@ -118,21 +131,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor } int opIndex = channel->workFifoTail%NCCL_MAX_OPS; struct ncclWork* w = channel->workFifo+opIndex; - struct ncclWorkElem* e = w->elems; - volatile uint8_t* activePtr = (volatile uint8_t*)&e->active; - while (activePtr[0] != 0) sched_yield(); + volatile uint8_t* typePtr = (volatile uint8_t*)&w->header.type; + while (typePtr[0] != ncclWorkTypeUnused) sched_yield(); memset(w, 0, sizeof(struct ncclWork)); // Initialize with work elem if provided - if (base) memcpy(e, base, sizeof(struct ncclWorkElem)); - e->active = 1; + if (base) memcpy(w->elems, base, sizeof(struct ncclWorkElem)); channel->workFifoTail++; channel->workCount++; if (work) *work = w; return ncclSuccess; } +// Finalize channel work FIFO states before launch +// Called during dynamic enqueue static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) { ncclComm_t comm = eqInfo->comm; + // Do not use comm->myParams in this function unless in non-graph mode + // In graph mode, enqueue is async to capture, myParams can have been changed hipLaunchParams* params = comm->myParams; // Only launch blocks where we have work to do. 
@@ -147,26 +162,24 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph eqInfo->maxChannels = params->gridDim.x; } - // Set active = 2 for the last operation and add a no-op on empty channels (p2p case). + // Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case). for (int c=0; c<eqInfo->maxChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (channel->workCount == 0) { struct ncclWork* w; NCCLCHECK(getNextOp(channel, &w, NULL)); - struct ncclWorkElem* e = w->elems; - e->comm = comm->devComm; - e->funcIndex = FUNC_INDEX_P2P; - e->p2p.nThreads = 0; + w->header.funcIndex = FUNC_INDEX_P2P; + w->header.type = ncclWorkTypeP2p; + w->header.nWarps = 0; } - channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active = 2; + channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1; if (c == 0) { // As we inline the first coll directly, we can free it immediately. // Except P2P or aggregation or registration cases struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS); - struct ncclWorkElem* elem = work->elems; - if (elem->funcIndex != FUNC_INDEX_P2P && eqInfo->elemList->count() == 1 && elem->regUsed == 0) - elem->active = 0; + if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1) + work->header.type = ncclWorkTypeUnused; } if (channel->gdrMemDesc) { @@ -226,6 +239,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { return ncclSuccess; } +// Check dependency wrt outside streams or previous launches +// Launch kernel in GROUP mode ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) { hipLaunchParams* params = comm->myParams; if (params->gridDim.x == 0) return ncclSuccess; @@ -261,6 +276,7 @@ ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) { return ncclSuccess; } +// Launch kernel in PARALLEL mode ncclResult_t ncclLaunchKernel(ncclComm_t comm) { hipLaunchParams *params = comm->myParams; if 
(params->gridDim.x == 0) return ncclSuccess; @@ -283,6 +299,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) { return ncclSuccess; } +// Launch network proxy static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) { // Start the network proxies as soon as the kernel has been launched. We can't // perform any CUDA call between the two or having a cudaFree between the CUDA @@ -302,6 +319,7 @@ static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) { return ncclSuccess; } +// Record done event for current launch ncclResult_t ncclRecordEvents(ncclComm_t comm) { hipLaunchParams *params = comm->myParams; @@ -320,6 +338,7 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) { return ncclSuccess; } +// Reset parameter space for launch ncclResult_t ncclLaunchReset(ncclComm_t comm) { comm->userStreamSet = false; @@ -333,6 +352,8 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) { NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo)); } + // After capturing an op in graph mode or launching the op in non-graph mode + // we can reset myParams for use in next op hipLaunchParams *params = comm->myParams; params->gridDim.x = params->blockDim.x = 0; params->func = NULL; @@ -347,10 +368,10 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) { /*****************************************************************************/ /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ -RCCL_PARAM(SharpThreshold, "SHARP_THRESHOLD", 16384); static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) { - if (info->comm->collNetSupport > 0 && info->nBytes < rcclParamSharpThreshold()) { + if (info->comm->collNetSupport > 0) { + // Translate ncclAvg and PreMulSum ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? 
ncclSum : info->op; NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport)); } else { @@ -359,6 +380,7 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet return ncclSuccess; } +// numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency. static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) { struct ncclComm* comm = info->comm; if (comm->nRanks == 1 || info->coll == ncclFuncAllToAllPivot) { @@ -395,6 +417,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; if (info->algorithm == NCCL_ALGO_COLLNET) { + // CollNet channel tuning int ncSwitch = 16; bool flag = true; while (ncSwitch >= 1 && flag) { @@ -405,6 +428,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i ncSwitch /= 2; } } else { + // Ring/Tree channel tuning while (info->nBytes < nc*nt*threadThreshold) { if (nc >= 2) nc--; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) @@ -419,6 +443,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i #else if (info->protocol == NCCL_PROTO_SIMPLE) { nt += WARP_SIZE; // Extra warp for sync + // More threads or sync warps needed due to split thread model if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE; } @@ -495,11 +520,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { RCCL_PARAM(IntraNetThreshold, "INTRANET_THRESHOLD", 8388608); -static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) { - work->comm = info->comm->devComm; - +static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* 
work, struct ncclProxyOp* proxyOp /* output */) { int collNetTypeSupport = 0; - // Check whether algo and proto have been preset + // Check whether algo and proto have been preset (as in aggregation case) + // If so, skip the calculation if (info->nChannels > 0 && info->nThreads > 0) goto comp_next; NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport)); NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1)); @@ -509,59 +533,37 @@ comp_next: NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); - work->coll.opCount = info->comm->collOpCount; + work->opCount = info->opCount; + work->header.type = ncclWorkTypeColl; work->sendbuff = info->sendbuff; work->recvbuff = info->recvbuff; - work->coll.root = info->root; - work->coll.count = info->count; - work->coll.nChannels = info->nChannels; - work->nThreads = info->nThreads; - work->coll.redOpArg = info->opFull.scalarArg; + work->root = info->root; + work->count = info->count; + work->nChannels = info->nChannels; + work->header.nWarps = info->nThreads / info->comm->WarpSize; + work->redOpArg = info->opFull.scalarArg; work->redOpArgIsPtr = info->opFull.scalarArgIsPtr; if (info->comm->nRanks == 1) { // one-rank reduce index - work->funcIndex = FUNC_INDEX_P2P - ncclNumTypes + int(info->datatype); + work->header.funcIndex = FUNC_INDEX_P2P - ncclNumTypes + int(info->datatype); + //work->header.funcIndex = 1 + int(info->datatype); return ncclSuccess; } else if (info->coll == ncclFuncAllToAllPivot) { - work->funcIndex = FUNC_INDEX_ALLTOALL_PIVOT; + work->header.funcIndex = FUNC_INDEX_ALLTOALL_PIVOT; } else { - work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); + work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); } - work->coll.connIndex = 0; - proxyArgs->connIndex = 0; + work->connIndex = 0; + proxyOp->connIndex = 0; if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) { if 
(info->comm->useIntraNet && info->nBytes > rcclParamIntraNetThreshold()) { - work->coll.connIndex = NCCL_CONN_IDX_P2P_NET; - proxyArgs->connIndex = NCCL_CONN_IDX_P2P_NET; + work->connIndex = NCCL_CONN_IDX_P2P_NET; + proxyOp->connIndex = NCCL_CONN_IDX_P2P_NET; } } - { // [RCCL] Check for clique-based kernel support - if (info->comm->cliqueManager->IsSupported(info->coll, - info->count, - info->datatype, - info->op)) - { - info->algorithm = NCCL_ALGO_RING; - info->protocol = NCCL_PROTO_CLIQUE; - // Determine the number of channels to use for clique-kernel - NCCLCHECK(info->comm->cliqueManager->GetNumChannelsToUse(info->coll, - info->count, - info->datatype, - info->op, - info->comm->nChannels, - &work->clique.nChannels)); - work->clique.count = info->count; - work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); - - // Setup pointers to where all the input/output pointers will be - NCCLCHECK(info->comm->cliqueManager->WaitForPointers(work)); - return ncclSuccess; - } - } // [RCCL] - int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? 
info->sliceSteps : 1; @@ -576,22 +578,22 @@ comp_next: while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; } // Use lastChunkSize as chunkSize - work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) { // Optimize chunkSize / nSteps while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize - work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); // Set direct direction for broadcast-gather (read or write) work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? 
NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; } else if (info->protocol == NCCL_PROTO_LL) { const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; - work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); - ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t)); - work->coll.lastChunkSize /= ncclTypeSize(info->datatype); + work->lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); + ALIGN_SIZE(work->lastChunkSize, info->nThreads*sizeof(uint64_t)); + work->lastChunkSize /= ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { int nNodes = info->comm->nNodes; float ppn = info->comm->nRanks / (float)nNodes; @@ -599,7 +601,7 @@ comp_next: while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize - work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); + work->lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); } // Compute nSteps for proxies @@ -608,29 +610,29 @@ comp_next: if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize))); - proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps; - 
proxyArgs->sliceSteps = sliceSteps; - proxyArgs->chunkSteps = chunkSteps; - proxyArgs->chunkSize = chunkSize; - proxyArgs->protocol = info->protocol; - proxyArgs->dtype = info->datatype; - proxyArgs->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet + proxyOp->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; + proxyOp->sliceSteps = sliceSteps; + proxyOp->chunkSteps = chunkSteps; + proxyOp->chunkSize = chunkSize; + proxyOp->protocol = info->protocol; + proxyOp->dtype = info->datatype; + proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum info->op; - proxyArgs->pattern = info->pattern; - proxyArgs->root = info->root; + proxyOp->pattern = info->pattern; + proxyOp->root = info->root; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up - proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps; + proxyOp->nbytes = stepSize*proxyOp->sliceSteps; TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p", - proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, - nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm); + proxyOp->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, + nLoops, proxyOp->nsteps, chunkSize, info->comm); // For Pivot A2A, lastChunkSize is not needed, set pivotA2ANumBiRings instead if (info->coll == ncclFuncAllToAllPivot) { - work->coll.pivotA2ANumBiRings = info->comm->topo->pivotA2ANumBiRings; + work->pivotA2ANumBiRings = 
info->comm->topo->pivotA2ANumBiRings; } return ncclSuccess; @@ -647,6 +649,7 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) { return ncclSuccess; } +// Handle structure for user buffer registration (IPC) exchange struct ncclBuffRegHandle { hipIpcMemHandle_t sendBuffIpc; hipIpcMemHandle_t recvBuffIpc; @@ -661,37 +664,48 @@ static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuf if (comm->localRanks == 1) return ncclSuccess; if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old - struct ncclBuffRegHandle regHandles[NCCL_MAX_INTRA_RANKS]; + ncclResult_t ret = ncclSuccess; + struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS]; // Get IPC handles // Note: the handle only corresponds to the base address of the allocation - CUDACHECK(hipIpcGetMemHandle(®Handles[comm->intraNodeRank].sendBuffIpc, (void*)info->sendbuff)); - CUDACHECK(hipIpcGetMemHandle(®Handles[comm->intraNodeRank].recvBuffIpc, (void*)info->recvbuff)); + CUDACHECKGOTO(hipIpcGetMemHandle(®Handles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback); + CUDACHECKGOTO(hipIpcGetMemHandle(®Handles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback); // Get offset of user buffer within allocation void* baseAddr; size_t size; + // Get base address CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff)); - regHandles[comm->intraNodeRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr; + regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr; CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff)); - regHandles[comm->intraNodeRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr; - TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->intraNodeRank].recvBuffOffset); + regHandles[comm->localRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr; + 
TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset); // Exchange handles within node - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle))); + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle))); // Open handles at local process for (int i=0; ilocalRanks; i++) { - if (i == comm->intraNodeRank) { + // Skip myself + if (i == comm->localRank) { regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL; continue; } + // Get base address of mapping CUDACHECK(hipIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, hipIpcMemLazyEnablePeerAccess)); CUDACHECK(hipIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, hipIpcMemLazyEnablePeerAccess)); - // Get real address of buffer + // Get real buffer address by adding offset in the mapping regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset; regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset; } + // Marks the operation as being buffer registered regInfo->nBuffs = comm->localRanks; TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs); return ncclSuccess; + +reg_fallback: + // If we cannot register specific buffer types, we just bypass this stage, and continue without failing + (void)ret; + WARN("Unable to register user buffers"); + return ncclSuccess; } // Compute enqueue element, save it in list @@ -710,9 +724,8 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { // Compute cuda kernel arg and proxy arg templates struct ncclQueueElem* eqElem; NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); - struct ncclWorkElem* work = &eqElem->work; - eqElem->proxyArgs.nsubs = 1; - 
NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs)); + struct ncclWork* work = &eqElem->work; + NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp)); // Determine grid size hipLaunchParams* params = comm->myParams; @@ -724,9 +737,13 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { // Inline the first kernel if (params->func == NULL) { params->func = (void *)ncclKerns[0]; - memcpy(&comm->args, work, sizeof(struct ncclWorkElem)); - comm->args.coll.bid = 0; // Only inline for channel 0 - comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode + if (work->header.type == ncclWorkTypeColl) { + // Copy the first operation to the inline argument. Type may be set later to + // ncclWorkTypeUnused if we have more than one coll element. + memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem)); + comm->args.bid = 0; // Only inline for channel 0 + comm->args.header.isLast = 1; // I am so far the last element + } } // Register and exchange input and output buffers @@ -736,15 +753,17 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks == 1) { // only in multi-process mode NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo)); - // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo - // because the registered addresses are in ncclWork - if (eqElem->buffRegInfo.nBuffs > 0) comm->args.active = 0; comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs; + work->header.type = ncclWorkTypeRegColl; + // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo + // because the registered addresses are in ncclWorkElemReg + comm->args.header.type = ncclWorkTypeUnused; } return ncclSuccess; } +// Find the channel with the least enqueued work (counted in bytes) static inline int findShortestChannel(ncclComm_t 
comm) { size_t minSize = SIZE_MAX; int minC = 0; @@ -758,6 +777,7 @@ static inline int findShortestChannel(ncclComm_t comm) { return minC; } +// Get next channel based on shortest-queue mode or round-robin mode static inline int getNextChannel(ncclComm_t comm, int aggMode) { int nextChannel = 0; if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) { @@ -769,6 +789,8 @@ static inline int getNextChannel(ncclComm_t comm, int aggMode) { return nextChannel; } +// Setup aggregated kernels +// Op info has been previously saved in comm->asyncOps ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { if (comm->asyncOpCount == 0) { return ncclSuccess; @@ -779,16 +801,22 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { NCCLCHECK(ncclSetupCollKernel(info)); } else { // Aggregation + // Determine a per-channel chunk size used to divide an operation into multiple channels size_t channelSize; if (comm->channelSize > 0) { + // Set by user channelSize = comm->channelSize; } else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) { + // CollNet specific size (tuned based on experiments) channelSize = 256 * 1024; } else { - channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); // scale channel size based on nranks as latency increases + // Latency increases as scale increases + // We would thus want to increase the chunk size to compensate for the lost efficiency + channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); } // Reduce the per-channel size if we cannot fully utilize the channels while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2; + // Check whether the ops have same reduce and data types (and hence can be packed in same ncclWork) int channelUsed = 0; int homogeneous = 1; int allCollNetSupport = comm->collNetSupport; @@ -801,6 +829,8 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { info->nChannels = std::min(std::max(1, 
(int)DIVUP(info->nBytes, channelSize)), comm->nChannels); // assign number of channels } channelUsed += info->nChannels; + //printf("asyncOpCount %d nChannels %d used %d info->nBytes %ld channelSize %ld comm->nChannels %d\n", + //c, info->nChannels, channelUsed, info->nBytes, channelSize, comm->nChannels); // We can use fast path if all collectives are the same homogeneous &= info->coll == comm->asyncOps[0].coll && info->opFull.op == comm->asyncOps[0].opFull.op && @@ -808,6 +838,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport)); } // Compute algo, proto, nthreads for the entire kernel + // Prepare a synthetic op info to calculate the collective algo struct ncclInfo total; total.comm = comm; total.coll = comm->asyncOps[0].coll; @@ -815,16 +846,18 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { total.nChannels = std::min(channelUsed, comm->nChannels); int perChannelOps = DIVUP(channelUsed, total.nChannels); if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps)); + // Set for each op for (int c = 0; c < comm->asyncOpCount; c++) { struct ncclInfo* info = comm->asyncOps+c; if (homogeneous) { + // Set fields to skip the individual computeColl in ncclSetupCollKernel info->algorithm = total.algorithm; info->protocol = total.protocol; info->nThreads = total.nThreads; } NCCLCHECK(ncclSetupCollKernel(info)); } - comm->args.active = 0; // disable inline argument + comm->args.header.type = ncclWorkTypeUnused; // disable inline argument } // Reset counters comm->asyncOpCount = 0; @@ -832,6 +865,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { return ncclSuccess; } +// Store aggregated operations info static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) { ncclComm_t comm = info->comm; if (comm->asyncOpCount >= NCCL_MAX_OPS) { @@ -850,14 +884,19 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { struct ncclComm* comm = 
info->comm; int peer = info->root; ssize_t nBytes = info->count*ncclTypeSize(info->datatype); - if (info->opName[0] == 'S') { // Send + int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; + int peerNode = comm->rankToNode[peer]; + int peerIndex = comm->rankToLocalRank[peer]; + int nsteps = comm->maxLocalRanks; + int rankIndex = comm->rankToLocalRank[comm->rank]; + if (info->coll == ncclFuncSend) { if (peer != comm->rank) { int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks; for (int c=0; cp2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].send[0].connected == 0) { - comm->connectSend[peer] |= (1<connect[0] = 1; + if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector + comm->connectSend[peer+comm->nRanks*1] |= (1<connect[1] = 1; } if (comm->p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) { comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<p2pSends[info->root], (void*)info->sendbuff, nBytes)); + NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes, info->opCount)); comm->p2pSendCount++; } else { if (peer != comm->rank) { int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks; for (int c=0; cp2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { - comm->connectRecv[peer] |= (1<connect[0] = 1; + if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector + comm->connectRecv[peer+comm->nRanks*1] |= (1<connect[1] = 1; } if (comm->p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) { comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<p2pRecvs[info->root], info->recvbuff, nBytes)); + NCCLCHECK(ncclSaveP2pInfo(comm->p2pRecvs[info->root], 
info->recvbuff, nBytes, info->opCount)); comm->p2pRecvCount++; } return ncclSuccess; } -enum { RingTree_Segment=0, P2P_Segment=1, CollNet_Segment=2 }; -static int getSegment(int type, int delta, struct ncclWork* work) { - // Current ncclWork is full - if (work->elems[NCCL_MAX_WORK_ELEMENTS-1].active != 0) return -1; +static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work, struct ncclComm* comm) { + if (work->header.type && (work->header.type != type)) return -1; - if (type == P2P_Segment) { // P2P - // Do not mix P2P and collective ops - if (work->elems[0].funcIndex != FUNC_INDEX_P2P) return -1; - for (int s=0; selems[s].p2p.delta != delta; s++) { - if (work->elems[s].active == 0) return s; + if (type == ncclWorkTypeP2p) { // P2P + int start = subType == ncclWorkSubTypeRecv ? 0 : 1; + for (int s=start; sWarpSize; s+=2) { + if (work->p2pElems[s].peer == -1) return s; + // Do not aggregate multiple sends to the same peer (or receives from the same peer) + if (work->p2pElems[s].peer == peer) return -1; } - } else if (type == CollNet_Segment) { // CollNet - for (int s=0; selems[s].active == 0) return s; + } else if (type == ncclWorkTypeRegColl) { // CollNet + for (int s=0; sregElems[s].elem.header.type == ncclWorkTypeUnused) return s; } - } else { // Ring or Tree + } else if (type == ncclWorkTypeColl) { // Ring or Tree for (int s=0; selems[s].active == 0) return s; + if (work->elems[s].header.type == ncclWorkTypeUnused) return s; } } return -1; } -static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) { - elem->comm = info->comm->devComm; - elem->funcIndex = FUNC_INDEX_P2P; - elem->nThreads = NCCL_MAX_NTHREADS; - elem->sendbuff = info->sendbuff; - elem->recvbuff = info->recvbuff; - elem->p2p.opCount = info->comm->p2pOpCount; - elem->p2p.sendCount = info->sendbytes; - elem->p2p.recvCount = info->recvbytes; - elem->p2p.sendChunkSize = 
info->sendChunkSize; - elem->p2p.recvChunkSize = info->recvChunkSize; - elem->p2p.delta = info->delta; +// Compute kernel arguments for P2P ops +static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) { + elem->header.type = ncclWorkTypeP2p; + elem->header.funcIndex = FUNC_INDEX_P2P; + elem->header.nWarps = NCCL_MAX_NTHREADS/info->comm->WarpSize; + elem->buff = info->recvbuff; + elem->subType = info->coll == ncclFuncSend ? ncclWorkSubTypeSend : ncclWorkSubTypeRecv; + elem->count = info->count; + elem->chunkSize = info->chunkSize; + elem->peer = info->root; + elem->opCount = info->opCount; + elem->connIndex = info->connIndex; return ncclSuccess; } -static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s, +// Equeue work elements into segment of ncclWork +// Supporting both collectives (aggregated or not) and P2P +static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s, struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) { - // Copy element into corresponding segment of ncclWork - memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); - work->elems[s].active = 1; - // Determine nThreads at dynamic time - if (type == P2P_Segment) { - const int nsegments = s+1; - int nThreads = 512; - while (nsegments*nThreads > 256) nThreads /= 2; - //if (nThreads >= 128) nThreads += WARP_SIZE; - for (int i=0; ielems[i].p2p.nThreads = nThreads; + if (type == ncclWorkTypeP2p) { + memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p)); + if(s) work->header.funcIndex = FUNC_INDEX_P2P; + int nelems = 0; + for (int i=0; iWarpSize; i++) { + if (work->p2pElems[i].header.type) nelems = i+1; + } + + int ngroups = 1; + while (ngroups < nelems) ngroups *= 2; + int nWarps = 1; + while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2; + + for (int i=0; 
ip2pElems[i].ngroups = ngroups; + work->p2pElems[i].warpStart = + i*(NCCL_MAX_NTHREADS/comm->WarpSize)/ngroups; +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + work->p2pElems[i].nWarps = nWarps; +#else + int extraWarp = nWarps >= 2 ? i%2 : 0; + work->p2pElems[i].nWarps = nWarps + extraWarp; +#endif + } + return ncclSuccess; } + memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); + + if (regInfo->nBuffs == 0) return ncclSuccess; + // Copy registered buffer addresses into ncclWork - if (regInfo->nBuffs > 0) { - struct ncclWorkRegElem* regElem = (struct ncclWorkRegElem*)(work->elems+s); - // For CollNet - for (int i=0; icollTree.down[i]; - if (peer == -1) break; - int j = comm->rankToIntraNodeRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - regElem->dnInputs[i] = regInfo->sendbuffs[j]; - regElem->dnOutputs[i] = regInfo->recvbuffs[j]; + struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s); + // For CollNet + for (int i=0; icollTree.down[i]; + if (peer == -1) break; + // Get intra-node slot + int j = comm->rankToLocalRank[peer]; + if (j < 0) { + WARN("Invalid intra-node rank %d for peer %d", j, peer); + return ncclInternalError; } - for (int i=0; icollTree.up[i]; - if (peer == -1) break; - int j = comm->rankToIntraNodeRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - regElem->upOutputs[i] = regInfo->recvbuffs[j]; - } - work->elems[s].regUsed = 1; + // Input buffer of leaf peer + regElem->dnInputs[i] = regInfo->sendbuffs[j]; + // Output buffer of leaf peer + regElem->dnOutputs[i] = regInfo->recvbuffs[j]; } + for (int i=0; icollTree.up[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; + if (j < 0) { + WARN("Invalid intra-node rank %d for peer %d", j, peer); + return ncclInternalError; + } + // Output buffer of root peer + regElem->upOutputs[i] = 
regInfo->recvbuffs[j]; + } + work->elems[s].regUsed = 1; return ncclSuccess; } +// Enqueue P2P op ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) { - struct ncclWorkElem* workElem = &eqElem->work; - struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; + struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems; + struct ncclProxyOp* proxyOp = &eqElem->proxyOp; // Try to reuse last p2p operation if not full yet - struct ncclChannel* channel = proxyArgs->subs[0].channel; + struct ncclChannel* channel = comm->channels+proxyOp->channelId; int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; struct ncclWork* w = channel->workFifo+opIndex; int segment = -1; if (channel->workCount) { // Try to pack more segments into a single operation - segment = getSegment(P2P_Segment, workElem->p2p.delta, w); + segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w, comm); } if (segment == -1) { NCCLCHECK(getNextOp(channel, &w, NULL)); - segment = 0; + segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1; + // Initialize work as P2P, set peer=-1 to designate the p2p elem is not used. + w->header.type = ncclWorkTypeP2p; + for (int i=0; iWarpSize; i++) w->p2pElems[i].peer = -1; } + //printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? 
"Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment); // store work element into FIFO - NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs)); - NCCLCHECK(enqueueSegOp(P2P_Segment, workElem, w, segment, &eqElem->buffRegInfo, channel, comm)); - comm->p2pOpCount++; + NCCLCHECK(ncclProxySaveP2p(comm, proxyOp)); + NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm)); return ncclSuccess; } +// Setup P2P op ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { ncclComm* comm = info->comm; // Compute cuda kernel arg and proxy arg templates struct ncclQueueElem* eqElem; NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); // The proxy code will set and tune the send/recv chunk size, make sure to run it first. - NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs)); - NCCLCHECK(computeP2pWorkElem(info, &eqElem->work)); - - eqElem->proxyArgs.sendIdx = info->sendIdx; - eqElem->proxyArgs.recvIdx = info->recvIdx; - eqElem->work.p2p.sendIdx = info->sendIdx; - eqElem->work.p2p.recvIdx = info->recvIdx; - + NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp)); + NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems)); + // Compute grid size int channelId = info->channelId; hipLaunchParams* params = comm->myParams; params->gridDim.x = std::max(params->gridDim.x, channelId+1); - params->blockDim.x = std::max(params->blockDim.x, eqElem->work.nThreads); + params->blockDim.x = std::max(params->blockDim.x, eqElem->work.header.nWarps*info->comm->WarpSize); comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here // Record the first kernel to launch @@ -1021,8 +1082,8 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { // The CUDA kernel does not use the inlined first work element as fastpath argument if (params->func == NULL) { params->func = (void *)ncclKerns[0]; - comm->args.comm = eqElem->work.comm; - comm->args.active = 
0; + //params->func = ncclKerns[eqElem->work.header.funcIndex]; + comm->args.header.type = ncclWorkTypeUnused; } return ncclSuccess; } @@ -1030,24 +1091,24 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { // Dynamic enqueue function for collective kernels // Supports both aggregated and non-aggregated modes ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) { - struct ncclWorkElem* work = &eqElem->work; - struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; + struct ncclWork* work = &eqElem->work; + struct ncclWorkElem* elem = work->elems; + struct ncclProxyOp* proxyOp = &eqElem->proxyOp; - int nChannels = work->coll.nChannels; - size_t channelSize = work->coll.count*ncclTypeSize(proxyArgs->dtype)/work->coll.nChannels; - int segmentType = proxyArgs->redOp == ncclNumOps ? RingTree_Segment : CollNet_Segment; // redOp is only set when using CollNet + int nChannels = elem->nChannels; + size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels; + enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? 
ncclWorkTypeColl : ncclWorkTypeRegColl; // redOp is only set when using CollNet for (int bid=0; bidchannels+channelId; // Proxy - proxyArgs->subs[0].channel = channel; - proxyArgs->opCount = comm->collOpCount; - proxyArgs->commOpCount = comm->opCount; - if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks)); + proxyOp->channelId = channelId; + proxyOp->opCount = comm->collOpCount; + if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks)); - work->coll.bid = bid % nChannels; + elem->bid = bid % nChannels; struct ncclWork* w = NULL; int segment = -1; if (aggMode && channel->workCount) { @@ -1056,9 +1117,9 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* w = channel->workFifo+opIndex; // All elems in work must have same (funcIndex,nThreads), // see "src/collectives/device/common.h" - if (w->elems[0].funcIndex == work->funcIndex && - w->elems[0].nThreads == work->nThreads) { - segment = getSegment(segmentType, 0, w); + if (w->header.funcIndex == work->header.funcIndex && + w->header.nWarps == work->header.nWarps) { + segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w, comm); } } if (segment == -1) { @@ -1067,16 +1128,20 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* } // store work element into FIFO - NCCLCHECK(enqueueSegOp(segmentType, work, w, segment, &eqElem->buffRegInfo, channel, comm)); + NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm)); channel->totalSize += channelSize; } comm->collOpCount++; return ncclSuccess; } +// Host setup node for CUDA Graph +// Performs the enqueue job template void HIPRT_CB ncclEnqueueHostSetup(void* arg) { + NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret; + // All work for current launch has been captured in Queue Info struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg; ncclComm_t comm = eqInfo->comm; int aggMode = eqInfo->elemList->count() > 1 ? 
1 : 0; @@ -1084,7 +1149,7 @@ void HIPRT_CB ncclEnqueueHostSetup(void* arg) { // Iterate through the element list struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); while (eqElem != NULL) { - if (eqElem->work.funcIndex == FUNC_INDEX_P2P) { + if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) { NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end); } else { NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end); @@ -1105,6 +1170,8 @@ cb_end: template void HIPRT_CB ncclEnqueueHostSetup<0>(void*); template void HIPRT_CB ncclEnqueueHostSetup<1>(void*); +// CUDA Graph helper thread +// for de-registering user buffers void* graphHelperFunc(void *args) { struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args; if (res == NULL) { @@ -1118,8 +1185,10 @@ void* graphHelperFunc(void *args) { volatile enum helperThreadState* state = &res->threadState; volatile int* ipcTail = &res->ipcTail; while (1) { + // Last IPC entry enqueue so far int ipcTailMark = *ipcTail; int ipcCount = 0; + // Close IPC till the last entry while (res->ipcHead != ipcTailMark) { if (res->ipcBases[res->ipcHead] != NULL) CUDACHECKIGNORE(hipIpcCloseMemHandle(res->ipcBases[res->ipcHead])); @@ -1129,6 +1198,7 @@ void* graphHelperFunc(void *args) { } TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount); pthread_mutex_lock(&res->threadLock); + // Check for exit signal while (res->ipcHead == *ipcTail && *state != ThreadStop) { pthread_cond_wait(&res->threadCond, &res->threadLock); } @@ -1140,23 +1210,24 @@ void* graphHelperFunc(void *args) { } } +// Check if we are in CUDA Graph capture mode ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) { comm->usingCudaGraph = 0; + // Feature requires CUDA 11.3/R465 or above #if CUDART_VERSION >= 11030 - hipStreamCaptureStatus captureStatus; - unsigned long long hipGraphId; + cudaStreamCaptureStatus captureStatus; + unsigned long long cudaGraphId; + ncclResult_t ret = 
ncclSuccess; if (comm->driverVersion < 11030) { - CUDACHECK(hipStreamIsCapturing(comm->userStream, &captureStatus)); - if (captureStatus != hipStreamCaptureStatusNone) { - WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); - return ncclInvalidUsage; - } - return ncclSuccess; + // Runtime driver version older than compiler version + // Enhanced compat fallback + goto enh_compat_end; } - CUDACHECK(hipStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &hipGraphId, graph, NULL, NULL)); - if (captureStatus == hipStreamCaptureStatusActive) { - if (hipGraphId != comm->lastCudaGraphId) { - INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", hipGraphId); + // Get CUDA Graph handle + CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end); + if (captureStatus == cudaStreamCaptureStatusActive) { + if (cudaGraphId != comm->lastCudaGraphId) { + INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId); // We are in a new graph, hence need to forget the last setup node so that // the first setup node in the new graph will not have a dependency comm->lastCudaGraphId = hipGraphId; @@ -1169,15 +1240,31 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph) { // Only create this thread when buffer registration is enabled if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) { pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL); + // Init signaling method between Graph destroy function and helper thread pthread_cond_init(&comm->graphHelperResources->threadCond, NULL); + // Set state comm->graphHelperResources->threadState = ThreadStart; + // Create thread pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources); + // Name thread + ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", 
comm->cudaDev); } } + return ncclSuccess; + +enh_compat_end: // Enhanced compat fallback + (void)ret; + CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus)); + if (captureStatus != cudaStreamCaptureStatusNone) { + WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); + return ncclInvalidUsage; + } + // If we are not in capture mode, we can ignore the driver being lower #endif return ncclSuccess; } +// Create host setup node in CUDA Graph ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) { #if CUDART_VERSION >= 11030 struct ncclQueueInfo* eqInfo = comm->enqueueInfo; @@ -1185,14 +1272,17 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph) { // which CUDA graph would manage lifetime of hipUserObject_t object; CUDACHECK(hipUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, hipUserObjectNoDestructorSync)); + // Hand over ownership to CUDA Graph CUDACHECK(hipGraphRetainUserObject(graph, object, 1, hipGraphUserObjectMove)); hipHostFn_t fn = ncclEnqueueHostSetup<1>; // Add a CPU node to the graph hipGraphNode_t setupNode; + // Function + parameter space for that function (i.e. enqueue info) hipHostNodeParams setupNodeParams = {fn, eqInfo}; int numDependencies = comm->lastSetupNode == NULL ? 
0 : 1; CUDACHECK(hipGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams)); + // Create dependency from last setup node in the same graph CUDACHECK(hipStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, hipStreamAddCaptureDependencies)); comm->lastSetupNode = setupNode; return ncclSuccess; @@ -1271,20 +1361,6 @@ static ncclResult_t hostToDevRedOp( } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { - // [RCCL] Check for clique-based kernel support - { - if (info->comm->cliqueManager->IsSupported(info->coll, - info->count, - info->datatype, - info->op)) - { - // Declare the input / output pointers being used (to exchange via IPC with other ranks) - // This is done immediately, and does not block - NCCLCHECK(info->comm->cliqueManager->DeclarePointers(info->sendbuff, info->recvbuff)); - } - } - // [/RCCL] - ncclResult_t ret = ncclSuccess; bool isAsync = ncclAsyncMode(); int savedDev = -1; @@ -1300,6 +1376,12 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { // op handle may be destroyed before ncclGroupEnd(). NCCLCHECKGOTO(hostToDevRedOp(&info->opFull, info->op, info->datatype, info->comm), ret, end); + // Update opCount + if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) + info->opCount = info->comm->p2pOpCount++; + else + info->opCount = info->comm->collOpCount; + // Launch asynchronously if needed if (isAsync) { // Always register comm even in case of error to make sure ncclGroupEnd @@ -1308,10 +1390,10 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECKGOTO(checkSetStream(info), ret, end); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p devRedOp %d isPtr %d scaler %lx", - info->opName, info->coll == ncclFuncSendRecv ? 
info->comm->p2pOpCount : info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count, + info->opName, info->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream, info->opFull.op, info->opFull.scalarArgIsPtr, info->opFull.scalarArg); - if (info->coll == ncclFuncSendRecv) { //p2p stored separately + if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { //p2p stored separately NCCLCHECKGOTO(ncclSaveP2p(info), ret, end); } else { NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end); @@ -1320,7 +1402,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECKGOTO(checkSetStream(info), ret, end); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count, + info->opName, info->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); // Check whether we are in cuda graph mode diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 3ac7d74164..0c3ba56629 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,6 +9,7 @@ #include "graph.h" #include "trees.h" #include "rings.h" +#include "topo.h" /******************************************************************/ /********************* Internode connection ***********************/ @@ -18,7 +19,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; - int localRanks = comm->localRanks; + int localRanks = comm->topo->nodes[GPU].count; int nChannels = comm->nChannels; for (int c=0; cnodes[CPU].nodes+c; +static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { + struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; int l=0; // Node 1 -> CPU - for (int i=0; ipaths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i]; + for (int i=0; ipaths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i]; // CPU -> Node 2 for (int i=0; ipaths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i]; // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type); - srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width); + srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); + if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; + srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width); return ncclSuccess; } @@ -240,6 +241,8 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE return ncclSuccess; } 
+NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); + int ncclTopoUserP2pLevel = -1; ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) { *p2p = 0; @@ -255,13 +258,14 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ return ncclSuccess; } - + int intermediateIndex = -1; // Set intermediate GPU rank, if routing through an intermediate GPU. struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2; if (path->count == 2) { struct ncclTopoNode* intermediateNode = path->list[0]->remNode; - if (intermediateNode->type == GPU && intermediateRank) { - *intermediateRank = intermediateNode->gpu.rank; + if (intermediateNode->type == GPU) { + intermediateIndex = intermediateNode - system->nodes[GPU].nodes; + if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank; } } @@ -291,6 +295,41 @@ compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else + if (*p2p == 1) { + // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to + // validate against NVML at all since they are pretending to be on other hw. 
+ if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) { + int indexes[3] = {-1,-1,-1}; + int verticeN = 0; + NCCLCHECK(ncclNvmlEnsureInitialized()); + + indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev; + if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev; + indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev; + + for (int i=1; i < verticeN; i++) { + nvmlGpuP2PStatus_t status; + status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead; + bool good = status == NVML_P2P_STATUS_OK; + status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite; + good &= status == NVML_P2P_STATUS_OK; + if (!good) { + if (ncclParamIgnoreDisabledP2p()) { + *p2p = 0; + } else if (path->type <= PATH_NVB) { + WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); + return ncclUnhandledCudaError; + } else if (path->type < PATH_SYS) { + INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. 
You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); + } + } + } + } + } +#endif + if (path->type == PATH_NVL) { struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; // Enable P2P Read for Ampere/NVLink only @@ -359,6 +398,14 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int } int distance = gpu->paths[NET][n].type; + if (distance == PATH_PXN) { + // In case of PXN, use the intermediate GPU distance instead + int proxyRank, g; + NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank)); + NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); + struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; + distance = proxyGpu->paths[NET][n].type; + } if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel); return ncclSuccess; @@ -369,6 +416,77 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int return ncclSuccess; } +ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) { + // Get GPU and NET + int n, g; + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + struct ncclTopoLinkList* path = gpu->paths[NET]+n; + if (path->type == PATH_PXN) { + struct ncclTopoNode* node; + int type = NVS; + for (int i=0; icount && type == NVS; i++) { + node = path->list[i]->remNode; + type = node->type; + } + if (type != GPU) { + WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev); + return ncclInternalError; + } + *intermediateRank = node->gpu.rank; + } else { + *intermediateRank = rank; + } + return ncclSuccess; +} + +NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1); + +// Net v4 plugins don't have non-blocking connect/accept. 
We can't therefore use +// remote proxies without risking deadlocks +int ncclPxnDisable() { + static int pxnDisable = -1; + if (pxnDisable == -1) { + if (ncclNetVersion() == 4) { + INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); + pxnDisable = 1; + } else { + pxnDisable = ncclParamPxnDisable(); + } + } + return pxnDisable; +} + +ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) { + struct ncclTopoSystem* system = comm->topo; + *nranks = 0; + *intermediateRanks = NULL; + if (system->nodes[NET].count == 0) return ncclSuccess; + + int nr = 0; + int* ranks = NULL; + for (int rank=0; ranknRanks; rank++) { + int netDev, proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank)); + if (proxyRank == comm->rank) continue; + int useGdr; + NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr)); + if (useGdr == 0) continue; + int found = 0; + for (int r=0; rnodes[GPU].count; g++) { + // Check whether we can access the NIC through another NVLink-connected GPU (PXN) + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) { + for (int p=0; pnodes[GPU].count; p++) { + if (p == g) continue; + struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p; + + // To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one. + int netDev; + NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev)); + // Make sure we can allocate memory on that GPU. + if (netDev != netNode->id) continue; + + // PXN = PCI + NVLink. + if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue; + + // We can use that GPU as relay to communicate with that NIC. 
+ // Only enabling it in the GPU->NIC direction for now to favor + // receiving locally and sending remotely (consistent with net.cc) + NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n)); + break; + } + } // Update path when we dont want to / can't use GPU Direct RDMA. int gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); @@ -427,8 +568,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; NCCLCHECK(getLocalCpu(system, g, &localCpu)); - NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); - NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n)); + NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g)); + NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n)); } } } @@ -499,9 +640,9 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) { int gdr, ret = 1; - int64_t net; + int net; for (int g = 0; g < system->nodes[GPU].count; g++) { - NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, &net, 0)); + NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, &net)); NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); if (!gdr) { ret = 0; @@ -544,6 +685,8 @@ void ncclTopoFree(struct ncclTopoSystem* system) { free(system); } +NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 2); + static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) { int peer; struct ncclTopoLinkList* path = NULL; @@ -563,7 +706,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /* } } else { // Remote rank, use network - *nChannels = 1; + *nChannels = 
ncclParamNChannelsPerNetPeer(); } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 47c0de1009..4a5715d470 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -257,10 +257,10 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time); // Try to keep all searchs within one second -#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19) -#define NCCL_SEARCH_TIMEOUT (1<<18) -#define NCCL_SEARCH_TIMEOUT_TREE (1<<17) -#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10) +#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18) +#define NCCL_SEARCH_TIMEOUT (1<<14) +#define NCCL_SEARCH_TIMEOUT_TREE (1<<14) +#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8) #define FORCED_ORDER_PCI 1 #define FORCED_ORDER_REPLAY 2 @@ -341,6 +341,57 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop return ncclSuccess; } +// Build a list of the best NETs to try. +// +// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu +// index when trying to get back to the NIC. +// +// The list is built the following way: +// 1. Select NETs starting with those close to GPU(s), based on paths[n].type. +// 2. 
For each GPU, once that list of NICs with a given distance is prepared, shuffle the list +// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which +// might have been choosen by GPU 0 (case with multiple independent communicators per node) +// 3. Then add the NETs to the final list if they were not already added by another closer GPU. + +ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { + int netCount = 0; + int localNetCount; + int* localNets; + NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count)); + + for (int t=0; t <= typeInter; t++) { + for (int g=0; gnodes[GPU].count; g++) { + if (gpu != -1 && gpu != g) continue; + localNetCount = 0; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + struct ncclTopoLinkList* paths = gpu->paths[NET]; + for (int n=0; nnodes[NET].count; n++) { + if (paths[n].type == t) localNets[localNetCount++] = n; + } + if (localNetCount == 0) continue; + // Shuffle by gpu NVML device number so that GPUs on the same PCI switch + // with multiple NICs don't use the same one as first choice. 
+ for (int r=0; rnodes[GPU].nodes[g].gpu.dev % localNetCount; r++) { + int net0 = localNets[0]; + for (int i=0; iinter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; - for (int n=0; nnodes[NET].count; n++) { + int netcount; + int* nets; + NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount)); + for (int i=0; inodes[NET].nodes+n; if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; @@ -395,6 +451,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->speedInter = speedInterSave; } } + free(nets); } } else if (step < system->nodes[GPU].count-1) { // Go to next GPU @@ -429,65 +486,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } -// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance. 
-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) { - float* maxwidths; - int* minhops; - int netcount = 0; - NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count)); - NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count)); - for (int n=0; nnodes[NET].count; n++) { - maxwidths[n] = 0.0; - minhops[n] = 255; - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - struct ncclTopoLinkList* paths = net->paths[GPU]; - for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) { - maxwidths[n] = paths[g].width; - minhops[n] = paths[g].count; - } - } - if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW - if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW - int index; - for (index = 0; index < netcount; index++) { - if (minhops[n] < minhops[nets[index]]) break; - } - // Insert net at index - // Shift all nets with higher nhops - for (int i = netcount; i>index; i--) nets[i] = nets[i-1]; - // Insert this net at index - nets[index] = n; - netcount++; - } - - *netcountRet = netcount; - - // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have - // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs. 
- for (int start = 0; start < netcount;) { - int end = start+1; - while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++; - // Shuffle - for (int r=0; rnodes[GPU].nodes[0].gpu.dev % (end-start); r++) { - int netStart = nets[start]; - for (int i=start; ispeedInter; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netcount; - NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount)); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount)); for (int i=0; inodes[NET].nodes+n; @@ -497,6 +501,8 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (net->net.maxChannels == 0) continue; graph->inter[graph->nChannels*2] = net->id; + graph->latencyInter = net->net.latency; + for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { @@ -637,7 +643,18 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra /* User defined graph from XML file */ /************************************/ -struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } }; +struct kvDict kvDictLinkType[] = { + { "LOC", PATH_LOC }, + { "NVL", PATH_NVL }, + { "NVB", PATH_NVB }, + { "PIX", PATH_PIX }, + { "PXB", PATH_PXB }, + { "PXN", PATH_PXN }, + { "PHB", PATH_PHB }, + { "SYS", PATH_SYS }, + { NULL, 0 } +}; + ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int* inter = graph->inter+2*c; @@ -677,6 +694,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra)); 
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter)); + if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); @@ -735,6 +753,7 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter)); const char* str; NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str)); @@ -768,12 +787,14 @@ float speedArrayInter[] = { 48.0, 30.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0); -RCCL_PARAM(EnableMultipleSAT, "ENABLE_MULTIPLE_SAT", 0); +NCCL_PARAM(CrossNic, "CROSS_NIC", 2); ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; + graph->crossNic = ncclParamCrossNic(); int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; graph->speedIntra = graph->speedInter = 0; + graph->latencyInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; graph->typeIntra = ngpus == 1 ? 
PATH_LOC : PATH_NVL; graph->typeInter = PATH_PIX; @@ -821,8 +842,6 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph // limit single node max channels when searching ring graph on Rome graph->maxChannels = 2; } - if (graph->collNet && !rcclParamEnableMultipleSAT()) - graph->maxChannels = 1; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; // SPLIT_TREE works better on older archs. @@ -890,19 +909,13 @@ search: goto search; } tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) { + + if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } tmpGraph.typeInter = PATH_PIX; - // Try a simpler tree - if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) { - tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; - goto search; - } - tmpGraph.pattern = graph->pattern; - if (crossNic && tmpGraph.crossNic == 0) { // Try again with crossNic if permitted tmpGraph.crossNic = crossNic; @@ -910,6 +923,13 @@ search: } tmpGraph.crossNic = 0; + // Try a simpler tree + if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) { + tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; + goto search; + } + tmpGraph.pattern = graph->pattern; + // Decrease speed until we find a solution if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) { tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex]; @@ -1014,17 +1034,66 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru return ncclSuccess; } -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int 
rr, int* dev) { +// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation +NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2); + +#include "comm.h" +ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) { if (graph) { // Honor the net device in the graph int channel = channelId%graph->nChannels; - int ngpus = system->nodes[GPU].count; + int ngpus = comm->topo->nodes[GPU].count; int index = graph->intra[channel*ngpus] == rank ? 0 : 1; *dev = graph->inter[channel*2+index]; + NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); + } else if (peerRank == -1) { + return ncclInternalError; } else { - int64_t id; - NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr)); - *dev = id; + // Start with our local NIC and local Rank + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev)); + *proxyRank = rank; + + int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel(); + // See whether we can use the remote rank preferred device. + if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) { + int netDev = comm->peerInfo[peerRank].netDev; + int n; + // Check that device exists on our node + if (ncclParamCrossNic() == 0) { + if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) { + WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank); + return ncclInvalidUsage; + } + *dev = netDev; + } + if (pxnLevel == 1) { + int g, n; + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g)); + NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); + struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g; + if (gpu->paths[NET][n].type <= PATH_PXN) { + *dev = netDev; + NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); + } + } else if (pxnLevel == 2) { + // Check whether we can access it through our node-local GPU for that NIC. 
+ for (int r=0; rlocalRanks; r++) { + int peerRank = comm->localRankToRank[r]; + if (comm->peerInfo[peerRank].netDev == netDev) { + int g1, g2, n; + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); + NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2)); + NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); + struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; + if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { + *proxyRank = peerRank; + *dev = netDev; + return ncclSuccess; + } + } + } + } + } } return ncclSuccess; } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 226217c1b7..7f13451227 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -22,11 +22,11 @@ const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) -const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PHB", "SYS" }; +const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" }; #else -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" }; #endif /******************************************************************/ @@ -127,6 +127,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo n->net.asic = 0ULL; n->net.port = NCCL_TOPO_UNDEF; n->net.width = 0.0; + n->net.latency = 0.0; } *node = n; return ncclSuccess; @@ -338,13 +339,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s ncclDebugNoWarn = NCCL_GRAPH; int mbps; - if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0; + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.width = mbps / 8000.0; - if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0; - if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0; - if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS; - if 
(ncclCollNet && xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0; + if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); net->net.busId = busId; ncclDebugNoWarn = 0; @@ -653,6 +655,16 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr } return ncclSuccess; } +static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if (index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value); + } + return ncclSuccess; +} ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { @@ -689,7 +701,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. 
int netDevCount = 0; - if (ncclCollNet) { + if (collNetSupport()) { NCCLCHECK(collNetDevices(&netDevCount)); for (int n=0; nwidth == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } + if (count == 0) { + *id = -1; + free(nets); + return ncclSuccess; + } + + int rr = system->nodes[GPU].nodes[g].gpu.dev; *id = nets[rr%count]; free(nets); return ncclSuccess; @@ -853,3 +873,14 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* if (ccMax) *ccMax = max; return ncclSuccess; } + +ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) { + for (int g=0; gnodes[GPU].count; g++) { + if (system->nodes[GPU].nodes[g].gpu.rank == rank) { + *localRank = g; + return ncclSuccess; + } + } + WARN("Could not find local GPU with rank %d\n", rank); + return ncclInternalError; +} diff --git a/src/graph/topo.h b/src/graph/topo.h index 321fb1e31d..56a468129d 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -45,9 +45,10 @@ extern const char* topoNodeTypeStr[]; // Skipping 2 for PATH_NVB #define LINK_PCI 3 // Skipping 4 for PATH_PXB -// Skipping 5 for PATH_PHB -#define LINK_SYS 6 -#define LINK_NET 7 +// Skipping 5 for PATH_PXN +// Skipping 6 for PATH_PHB +#define LINK_SYS 7 +#define LINK_NET 8 extern const char* topoLinkTypeStr[]; #define PATH_LOC 0 @@ -55,8 +56,10 @@ extern const char* topoLinkTypeStr[]; #define PATH_NVB 2 #define PATH_PIX 3 #define PATH_PXB 4 -#define PATH_PHB 5 -#define PATH_SYS 6 +#define PATH_PXN 5 +#define PATH_PHB 6 +#define PATH_SYS 7 +#define PATH_DIS 7 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -102,6 +105,7 @@ struct ncclTopoNode { uint64_t asic; int port; float width; + float latency; int gdrSupport; int collSupport; int maxChannels; @@ -149,8 +153,7 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id) ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width); ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); - -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr); +ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank); ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem); ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels); diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 63d0443a39..ff04c58444 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, 
NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -110,7 +110,7 @@ static struct tuningModel tuning_model_1 { .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.12, 1.00, 0.99 }, /* Ring (LL/LL128/Simple)*/ { 0.12, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } }, /* more than 2 nodes */ { /* Tree (LL/LL128/Simple)*/ { 0.15, 1.00, 0.42 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 1.00, 1.00 }, /* CollNet (LL/LL128/Simple)*/ { 1.00, 1.00, 1.00 } }, }, @@ -213,8 +213,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads); comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); - comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = - getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); + comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); #endif @@ -246,7 +245,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* 
comm, int minCompCap, int maxCom int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 : nRanks; - int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) : + int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 : nNodes; @@ -269,7 +268,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh*graphs[a]->nChannels); if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); - if (a == NCCL_ALGO_COLLNET) busBw *= .9; if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), ll128MaxBwPerCh*graphs[a]->nChannels); #endif if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 6b2c2e2e65..9b4a610aed 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -617,7 +617,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); if (busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess) dev = -1; } else { - NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); + NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); } NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev)); } @@ -632,7 +632,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm CUDACHECK(hipGetDeviceProperties(&devProp, dev)); cudaMajor = devProp.major; cudaMinor = devProp.minor; } else { - NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); + NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); } NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor)); } @@ -703,15 +703,15 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm for (int l=0; lret = (a)) != ncclSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ - return args; \ - } \ -} while(0) - -#define CUDACHECKTHREAD(a) do { \ - if ((a) != hipSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ - args->ret = ncclUnhandledCudaError; \ - return args; \ - } \ -} while(0) - void* ncclAsyncThreadMain(void* args_) { struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); @@ -120,18 +105,23 @@ ncclResult_t ncclGroupStart() { return ncclSuccess; } -static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, - void* recvbuff, ssize_t sendbytes, const void* sendbuff, uint16_t sendIdx, uint16_t recvIdx) { - 
struct ncclInfo info = { ncclFuncSendRecv, "SendRecv", - sendbuff, recvbuff, (size_t)std::max(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */ +static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) { + struct ncclInfo info = { ncclFuncSend, "Send", + NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ 1, 1 }; - info.delta = delta; info.channelId = channelId; - info.sendbytes = sendbytes; - info.recvbytes = recvbytes; - info.sendIdx = sendIdx; - info.recvIdx = recvIdx; - if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage; + info.opCount = opCount; + info.connIndex = connIndex; + NCCLCHECK(ncclSetupP2pKernel(&info)); + return ncclSuccess; +} +static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) { + struct ncclInfo info = { ncclFuncRecv, "Recv", + NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ + 1, 1 }; + info.channelId = channelId; + info.opCount = opCount; + info.connIndex = connIndex; NCCLCHECK(ncclSetupP2pKernel(&info)); return ncclSuccess; } @@ -203,15 +193,15 @@ ncclResult_t ncclGroupEnd() { for (int i=0; ifuncType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) { - args->coll.connIndex = 0; + if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) { + args->coll.connIndex = 1; pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args); } } for (int i=0; ifuncType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) { + if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) { int err = pthread_join(ncclGroupThreads[i], NULL); if (err != 0) { WARN("Error waiting for pthread_join : %s", strerror(errno)); @@ -219,7 +209,7 @@ ncclResult_t ncclGroupEnd() { } INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", 
args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize); NCCLCHECKGOTO(args->ret, ret, end); - args->coll.comm->connect[0] = 0; + args->coll.comm->connect[1] = 0; } } @@ -271,18 +261,31 @@ ncclResult_t ncclGroupEnd() { int index = 0; int delta = deltas[index]; sched_delta: - uint32_t from = (rank+nRanks-delta)%nRanks; - uint32_t to = (rank+delta)%nRanks; - struct ncclP2Pinfo* recv = comm->p2pRecvs[from] ? comm->p2pRecvs[from]->getNext() : NULL; - struct ncclP2Pinfo* send = comm->p2pSends[to] ? comm->p2pSends[to]->getNext() : NULL; + uint32_t recvPeer = (rank+nRanks-delta)%nRanks; + uint32_t sendPeer = (rank+delta)%nRanks; + struct ncclP2Pinfo* recv = comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL; + struct ncclP2Pinfo* send = comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL; if (recv != NULL || send != NULL) { ssize_t totRecvBytes = -1, totSendBytes = -1; if (recv != NULL) totRecvBytes = recv->nbytes; if (send != NULL) totSendBytes = send->nbytes; + if (recv) comm->p2pRecvCount--; + if (send) comm->p2pSendCount--; + if (recvPeer == comm->rank) { // Check self send/recv + if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; } + if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; } + if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; } + } + void* recvBuff = recv ? recv->buff : NULL; + void* sendBuff = send ? send->buff : NULL; + // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL. 
+ if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle(); + if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle(); + ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); - uint16_t sendIdx = 0, recvIdx = 0; + uint16_t sendIdx = 1, recvIdx = 1; if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold()) sendIdx = NCCL_CONN_IDX_P2P_NET; if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold()) @@ -299,23 +302,20 @@ sched_delta: if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; } if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; } // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested - // (total size == 0), otherwise set size to -1 so that the kernel skips the operation. - if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1; - if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1; - if (sendbytes >= 0 || recvbytes >= 0) { - NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId, - recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL, - sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL, sendIdx, recvIdx), ret, group_cleanup); + // (total size == 0), otherwise set size to -1. 
+ if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL; + if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL; + if (recv) { + NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)(recv->buff))+recvOffset, recv->opCount, recvIdx), ret, group_cleanup); + } + if (send) { + NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)(send->buff))+sendOffset, send->opCount, sendIdx), ret, group_cleanup); } recvOffset += recvChunkSize; sendOffset += sendChunkSize; chunk++; } while (sendRemaining || recvRemaining); - if (recv) comm->p2pRecvCount--; - if (send) comm->p2pSendCount--; } - if (recv == NULL && comm->p2pRecvs[from]) comm->p2pRecvs[from]->recycle(); - if (send == NULL && comm->p2pSends[to]) comm->p2pSends[to]->recycle(); index++; if (index == 1 && deltas[1] == deltas[0]) index++; if (index == 2 && deltas[2] == deltas[0]) index++; @@ -421,16 +421,6 @@ group_cleanup: } comm->p2pSendCount = comm->p2pRecvCount = 0; } - /* Free all proxy ops in state->nextOps */ - struct ncclProxyState* state = &comm->proxyState; - pthread_mutex_lock(&state->poolMutex); - for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) { - op->next = state->pool; - state->pool = op; - } - pthread_mutex_unlock(&state->poolMutex); - state->nextOps = NULL; - ncclLaunchReset(comm); } } diff --git a/src/include/alloc.h b/src/include/alloc.h index 49ef05d1e0..94c6b65627 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -31,16 +31,37 @@ static inline ncclResult_t ncclCudaHostFree(void* ptr) { } template -static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { +static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { void* p = malloc(nelem*sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); return ncclSystemError; } + //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); memset(p, 0, nelem*sizeof(T)); *ptr = (T*)p; return ncclSuccess; } +#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { + if (nelem < oldNelem) return ncclInternalError; + if (nelem == oldNelem) return ncclSuccess; + + T* oldp = *ptr; + T* p = (T*)malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memcpy(p, oldp, oldNelem*sizeof(T)); + free(oldp); + memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); + *ptr = (T*)p; + INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); + return ncclSuccess; +} struct __attribute__ ((aligned(64))) allocationTracker { union { diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 6f3f02cdb4..76a2b5a14b 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,18 +9,17 @@ #define NCCL_BOOTSTRAP_H_ #include "nccl.h" +#include "comm.h" ncclResult_t bootstrapNetInit(); ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); -ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid +ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); -ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr); -ncclResult_t bootstrapRemFree(int id, int rank, void* commState); ncclResult_t bootstrapClose(void* commState); ncclResult_t bootstrapAbort(void* commState); #endif diff --git a/src/include/checks.h b/src/include/checks.h index 98cf164133..6efc2cf663 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -61,6 +61,49 @@ } \ } while(true) +#define SYSCHECKGOTO(statement, res, label) do { \ + if ((statement) == -1) { \ + /* Print the back trace*/ \ + res = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#define NEQCHECK(statement, value) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ + return ncclSystemError; \ + } \ +} while (0); + +#define NEQCHECKGOTO(statement, value, res, label) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + res = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#define EQCHECK(statement, value) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ + return ncclSystemError; \ + } \ +} while (0); + +#define EQCHECKGOTO(statement, value, res, label) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + res = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + // Propagate errors up #define NCCLCHECK(call) do { \ ncclResult_t res = call; \ @@ -80,4 +123,39 @@ } \ } while (0); +#define NCCLWAIT(call, cond, abortFlagPtr) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + ncclResult_t res = call; \ + if (res != ncclSuccess) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + return ncclInternalError; \ + } \ + if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ +} while (!(cond)); + +#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + res = call; \ + if (res != ncclSuccess) { \ + if (ncclDebugNoWarn == 0) 
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ + if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ +} while (!(cond)); + +#define NCCLCHECKTHREAD(a) do { \ + if ((args->ret = (a)) != ncclSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + return args; \ + } \ +} while(0) + +#define CUDACHECKTHREAD(a) do { \ + if ((a) != hipSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + #endif diff --git a/src/include/coll_net.h b/src/include/coll_net.h index 0d17b76036..c2d831e916 100644 --- a/src/include/coll_net.h +++ b/src/include/coll_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -29,6 +29,6 @@ static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; } static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; } -static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; } +static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; } #endif diff --git a/src/include/collectives.h b/src/include/collectives.h index 518643cd14..db8fc99fd3 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. 
All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -37,8 +37,8 @@ struct ncclDevRedOpFull { /* Declare all collective operations */ #define DECL5(func, algo, proto, devredop, type) \ - extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(struct ncclWorkElem* args); \ - extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem c); \ + extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ + extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \ #define CONCAT(a,b) a##b #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) @@ -96,18 +96,18 @@ DECL(AllReduce) DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) DECL5(AllToAllPivot, RING, SIMPLE, Sum, int8_t) -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(struct ncclWorkElem* args); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(struct ncclWorkElem* args); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(struct ncclWorkElem* args); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(struct ncclWorkElem* args); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(struct ncclWorkElem* args); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(struct ncclWorkElem* args); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(struct ncclWorkElem* args); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(); 
+extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(); #if defined(RCCL_BFLOAT16) -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16)(struct ncclWorkElem* args); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16)(); #endif -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(struct ncclWorkElem* args); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(struct ncclWorkElem* args); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(); +extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); // CHUNKSIZE must be a multiple of SLICESIZE //#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) diff --git a/src/include/comm.h b/src/include/comm.h index 8c7ab14ff1..bdcfbf69ac 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -11,7 +11,7 @@ #include "transport.h" #include "p2p.h" // [RCCL] -#include "clique/CliqueManager.h" +//#include "clique/CliqueManager.h" // [/RCCL] #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) @@ -40,8 +40,6 @@ struct cudaLaunchParams { #define NCCL_LL128_THREAD_THRESHOLD 8 #define NCCL_SIMPLE_THREAD_THRESHOLD 64 -#define NCCL_MAX_INTRA_RANKS 32 - struct ncclSendMem { union { struct { @@ -50,10 +48,10 @@ struct ncclSendMem { void* ptrExchange; uint64_t redOpArgExchange[2]; char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; + int offsFifo[NCCL_STEPS]; }; char pad3[MEM_ALIGN]; }; - char buff[1]; // Actually larger than that }; struct ncclRecvMem { @@ -62,18 +60,18 @@ struct ncclRecvMem { uint64_t tail; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; int sizesFifo[NCCL_STEPS]; - void* ptrsFifo[NCCL_STEPS]; + int offsFifo[NCCL_STEPS]; + int flush; // For GDRCopy-based flush }; char pad4[MEM_ALIGN]; }; - char buff[1]; // Actually larger than that }; typedef hipError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*); enum helperThreadState {ThreadStart, ThreadStop}; -#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_INTRA_RANKS*NCCL_MAX_OPS) +#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) struct ncclGraphHelperResources { ncclComm* comm; @@ -91,6 +89,11 @@ struct ncclUserRedOp { ncclDevRedOpFull opFull; }; +struct ncclNodeRanks { + int localRanks; + int* localRankToRank; +}; + struct ncclComm { struct ncclChannel channels[MAXCHANNELS]; @@ -108,15 +111,18 @@ struct ncclComm { int cudaDev; // my cuda device index int64_t busId; // my PCI bus ID in int format cpu_set_t cpuAffinity; // CPU affinity of the GPU + int WarpSize; int node; int nNodes; - - // Intra-node rank info - int intraNodeGlobalRanks[NCCL_MAX_INTRA_RANKS]; + int localRank; int localRanks; - int intraNodeRank; - int8_t* rankToIntraNodeRank; 
+ int maxLocalRanks; + int* rankToNode; + int* rankToLocalRank; + int* localRankToRank; + // localRanks and localRanktoRank for all nodes + struct ncclNodeRanks* nodeRanks; enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode; hipStream_t userStream; @@ -176,14 +182,13 @@ struct ncclComm { // Storage for deferred intra-process launch hipLaunchParams * intraParams; hipLaunchParams *myParams; + pthread_t* intraThreads; int* intraCudaDevs; int* intraCGMode; // Whether we can use CUDA9 CGMD or not int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not struct ncclWorkElem args; - void* argsptr; + void* argsptrs[2]; - // Global proxy thread - pthread_t proxyThread; struct ncclProxyState proxyState; // Whether this communicator uses collNet @@ -205,8 +210,8 @@ struct ncclComm { int p2pRecvCount; // [RCCL] - CliqueManager* cliqueManager; // CliqueManager handles pointer collection / distribution for clique-based kernels - int rootPid; // Process ID of root + //CliqueManager* cliqueManager; // CliqueManager handles pointer collection / distribution for clique-based kernels + //int rootPid; // Process ID of root // [/RCCL] // Store info for cudaGraph diff --git a/src/include/debug.h b/src/include/debug.h index 6ce90ee375..7af38fd53d 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -16,6 +16,9 @@ #include #include +// Conform to pthread and NVTX standard +#define NCCL_THREAD_NAMELEN 16 + extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; @@ -37,4 +40,6 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch; #define TRACE(...) 
#endif +void ncclSetThreadName(pthread_t thread, const char *fmt, ...); + #endif diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 26936f530b..22cbd94c5d 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -13,7 +13,7 @@ #include "align.h" #include // [RCCL] Support for clique-based kernels -#include "clique/CliqueCommon.h" +//#include "clique/CliqueCommon.h" // [/RCCL] // Convert volatile access to atomic @@ -27,7 +27,7 @@ #define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t; +typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2]; #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet @@ -114,7 +114,7 @@ struct ncclConnInfo { uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case int *sizesFifo; // Sizes fifo from GPU to proxy - void* *ptrsFifo; // Buffer fifo from proxy to GPU + int *offsFifo; // Buffer fifo from proxy to GPU uint64_t step; // Keep where we are uint64_t llLastCleaning; @@ -126,10 +126,16 @@ struct ncclConnInfo { uint32_t* curr_hdp_reg; // Current GPU's HDP register }; +struct 
ncclProxyConnector { + int rank; + int localRank; + struct ncclProxyConnection* connection; + struct ncclComm* comm; +}; + struct ncclConnector { int connected; - struct ncclProxyArgs *proxyAppend; - struct ncclProxyArgs **proxyAppendPtr; + struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; @@ -180,90 +186,98 @@ struct ncclDevComm; #pragma pack(push) /* push current alignment to stack */ #pragma pack(8) /* set alignment to 8 bytes boundary */ -#define NCCL_MAX_WORK_ELEMENTS 1 -#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE) - /* ncclWork is to be a power of two, currently 8x64 bytes, */ /* to make sure reads to host from the CUDA kernel are aligned. */ /* Make sure to adjust padding at the end of ncclWorkElem. */ -struct ncclWorkElem { - // Header - struct ncclDevComm* comm; - uint16_t nThreads; +#define NCCL_WORK_SIZE 256 + +enum ncclWorkElemType : uint8_t { + ncclWorkTypeUnused=0, + ncclWorkTypeColl=1, + ncclWorkTypeP2p=2, + ncclWorkTypeRegColl=3 +}; +enum ncclWorkElemSubType : uint8_t { + ncclWorkSubTypeUnused =0, + ncclWorkSubTypeSend, + ncclWorkSubTypeRecv +}; + +struct ncclWorkElemHeader { uint16_t funcIndex; + enum ncclWorkElemType type; + uint8_t nWarps:5; + uint8_t isLast:1; +}; + +struct ncclWorkElem { + struct ncclWorkElemHeader header; uint8_t regUsed; uint8_t direct; - uint8_t active, redOpArgIsPtr; + uint8_t redOpArgIsPtr; + uint8_t pad_0; const void * sendbuff; void * recvbuff; - // Op-specific fields. + size_t count; union { - struct { - size_t count; - union { - size_t lastChunkSize; - // Pivot A2A kernel computes chunk size itself. - // Instead, it needs the number of bidirectional rings. 
- size_t pivotA2ANumBiRings; - }; - uint64_t redOpArg; - uint16_t root; - uint8_t bid; - uint8_t nChannels; - uint16_t connIndex; - uint16_t opCount; - } coll; - struct { - size_t sendCount; - size_t recvCount; - int sendChunkSize; - int recvChunkSize; - int32_t delta; - union { - struct { - uint16_t nThreads:12; - uint16_t sendIdx:2; - uint16_t recvIdx:2; - }; - uint16_t padding; - }; - uint16_t opCount; - } p2p; - // [RCCL] Clique-based arguments - // NOTE: Follows same field structure as coll - // because nChannels is accessed from "coll" struct. - struct { - size_t count; - cliqueDevicePtrs_t* ptrs; - uint64_t unused_1; - uint16_t unused_2; - uint8_t bid; - uint8_t nChannels; - } clique; - // [/RCCL] - uint64_t align[4]; + size_t lastChunkSize; + // Pivot A2A kernel computes chunk size itself. + // Instead, it needs the number of bidirectional rings. + size_t pivotA2ANumBiRings; }; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint16_t connIndex; + uint64_t redOpArg; + uint64_t opCount; }; -static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size"); +static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size"); -struct ncclWorkRegElem { +struct ncclWorkElemP2p { + struct ncclWorkElemHeader header; + int32_t peer; + void* buff; + size_t count; + int chunkSize; + uint8_t ngroups:4; + uint8_t warpStart:4; + uint8_t nWarps:4; + enum ncclWorkElemSubType subType:4; + uint16_t opCount:12; + uint16_t connIndex:4; +}; +static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size"); + +struct ncclWorkElemReg { struct ncclWorkElem elem; void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; -#define NCCL_REG_ELEM_FACTOR 4 -static_assert(sizeof(struct ncclWorkRegElem) == (NCCL_REG_ELEM_FACTOR*sizeof(struct ncclWorkElem)), 
"ncclWorkRegElem size must be pow2 times ncclWorkElem size"); +static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size"); +static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size"); + +#define NCCL_MAX_WORK_ELEMENTS 1 +#define NCCL_MAX_WORK_ELEMENTS_P2P 2 +#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg)) +// Number of named barriers supported by CUDA +#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE) struct ncclWork { union { + char pad[NCCL_WORK_SIZE]; + struct ncclWorkElemHeader header; struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; - struct ncclWorkRegElem regElems[NCCL_MAX_WORK_ELEMENTS/NCCL_REG_ELEM_FACTOR]; + struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; + struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; }; }; +static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned"); + struct ncclChannel { union { struct { @@ -309,10 +323,9 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must struct ncclProfElem { union { struct { + uint64_t opCount; uint64_t total_cycle; uint64_t wait_cycle; // total wait cycle - uint64_t wait_send_cycle; - uint64_t wait_recv_cycle; // primtive cycles uint64_t send_cycle; uint64_t directSend_cycle; @@ -341,22 +354,26 @@ struct ncclProfElem { uint64_t directRecvReduceCopySend_byte; }; int data[0x80]; - }; + } elem[MAXCHANNELS]; }; struct ncclProf { - struct ncclProfElem elems[MAXCHANNELS]; + struct ncclProfElem* elems; }; + +#define PROFILE_NUM_ITEMS 1024 #endif #ifdef ENABLE_COLLTRACE typedef enum { - ncclCollTraceNotReady, - ncclCollTraceKernelLaunchType, - ncclCollTraceKernelEndType, - ncclCollTraceCollEndType, - ncclCollTraceAbortType, - ncclCollTraceDataType + ncclCollTraceNotReady = 0, + ncclCollTraceKernelLaunchType = 1, + 
ncclCollTraceKernelEndType = 2, + ncclCollTraceCollLaunchType = 3, + ncclCollTraceAbortType = 4, + ncclCollTraceDataType = 5, + ncclCollTraceCollElemType = (1<<4), + ncclCollTraceP2pElemType = (1<<5), } ncclCollTraceDataType_t; struct ncclCollTrace { @@ -365,18 +382,24 @@ struct ncclCollTrace { int16_t funcIndex; uint32_t data_0; uint64_t timeStamp; - uint64_t opCount; + union { + uint64_t opCount; + uint32_t p2pOpCount[2]; + }; union { uint64_t data_1; struct { - uint16_t nThreads; + uint8_t nWarps; uint8_t bid; uint8_t nChannels; } coll; struct { - uint16_t nThreads; - uint16_t delta; - } p2p; + int16_t peer; + uint8_t ngroups:4; + uint8_t connIndex:4; + uint8_t warpStart:4; + uint8_t nWarps:4; + } p2p[2]; }; }; static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size"); @@ -397,7 +420,7 @@ struct ncclDevComm { #ifdef ENABLE_PROFILING // Profiling counters - struct ncclProf* devProf; + struct ncclProf devProf; #endif #ifdef ENABLE_COLLTRACE diff --git a/src/include/enqueue.h b/src/include/enqueue.h index c6674a83ef..d538a1da77 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -16,6 +16,7 @@ #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ size_t ncclKernMaxLocalSize(); +ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast); ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm); @@ -32,17 +33,17 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph); ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph); struct ncclBuffRegInfo { - void* sendbuffsBase[NCCL_MAX_INTRA_RANKS]; - void* recvbuffsBase[NCCL_MAX_INTRA_RANKS]; - void* sendbuffs[NCCL_MAX_INTRA_RANKS]; - void* recvbuffs[NCCL_MAX_INTRA_RANKS]; + void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS]; + void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS]; + void* sendbuffs[NCCL_MAX_LOCAL_RANKS]; + void* recvbuffs[NCCL_MAX_LOCAL_RANKS]; int nBuffs; }; // Enqueue information (for kernel and proxy) for each operation struct ncclQueueElem { - struct ncclWorkElem work; - struct ncclProxyArgs proxyArgs; + struct ncclWork work; + struct ncclProxyOp proxyOp; struct ncclBuffRegInfo buffRegInfo; }; @@ -88,7 +89,7 @@ static void ncclDestroyQueueInfo(void* ptr) { // but currently the destroy function of CUDA objects does not allow CUDA API calls while (eqElem != NULL) { for (int i=0; ibuffRegInfo.nBuffs; i++) { - if (i == eqInfo->comm->intraNodeRank) continue; + if (i == eqInfo->comm->localRank) continue; CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i])); CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i])); } diff --git a/src/include/graph.h b/src/include/graph.h index 29f81864f7..4cfe9539a6 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -1,6 +1,6 @@ /************************************************************************* 
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -31,12 +31,15 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); // Query topology -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net); +ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); #define MAX_XGMI_INTER_GPUS 4 ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev); ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr); +int ncclPxnDisable(); +ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); +ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); @@ -54,6 +57,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id); #define NCCL_TOPO_MAX_NODES 256 @@ -76,6 +80,7 @@ struct ncclTopoGraph { int nChannels; float speedIntra; float speedInter; + float latencyInter; int typeIntra; int typeInter; int sameChannels; diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index 4ec1ac6d4e..63555baf80 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -328,7 +328,8 @@ enum ibv_access_flags { IBV_ACCESS_REMOTE_WRITE = (1<<1), IBV_ACCESS_REMOTE_READ = (1<<2), IBV_ACCESS_REMOTE_ATOMIC = (1<<3), - IBV_ACCESS_MW_BIND = (1<<4) + IBV_ACCESS_MW_BIND = (1<<4), + IBV_ACCESS_RELAXED_ORDERING = (1<<20), }; struct ibv_pd { @@ -1065,6 +1066,7 @@ ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); +ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel 
*channel); diff --git a/src/include/info.h b/src/include/info.h index 08a80f69e7..b380389242 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,7 +12,7 @@ #include "devcomm.h" #include "collectives.h" -typedef enum { +typedef enum : uint8_t { ncclPatternRing, ncclPatternRingTwice, ncclPatternPipelineFrom, @@ -20,7 +20,9 @@ typedef enum { ncclPatternTreeUp, ncclPatternTreeDown, ncclPatternTreeUpDown, - ncclPatternCollTreeUpDown + ncclPatternCollTreeUpDown, + ncclPatternSend, + ncclPatternRecv } ncclPattern_t; // Used to pass NCCL call information between functions @@ -33,7 +35,7 @@ struct ncclInfo { size_t count; ncclDataType_t datatype; ncclRedOp_t op; - int root; + int root; // peer for p2p operations ncclComm_t comm; hipStream_t stream; // Algorithm details @@ -49,14 +51,10 @@ struct ncclInfo { size_t nBytes; int nstepsPerLoop; int nchunksPerLoop; - ssize_t sendbytes; - ssize_t recvbytes; - int recvChunkSize; - int sendChunkSize; - uint32_t delta; + int chunkSize; int channelId; - uint16_t sendIdx; - uint16_t recvIdx; + uint16_t connIndex; + uint64_t opCount; }; #endif diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 389c1eaa93..ce616724cd 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,7 +10,7 @@ #include "nccl.h" #include -#define NCCL_NET_HANDLE_MAXSIZE 64 +#define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 @@ -31,10 +31,114 @@ typedef struct { int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA int speed; // Port speed in Mbps. int port; // Port number. + float latency; // Network latency int maxComms; // Maximum number of comms we can create -}ncclNetProperties_v4_t; + int maxRecvs; // Maximum number of grouped receives. +}ncclNetProperties_v5_t; -typedef ncclNetProperties_v4_t ncclNetProperties_t; +typedef ncclNetProperties_v5_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +typedef ncclNet_v5_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v5_t; + +typedef ncclCollNet_v5_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5 + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int speed; // Port speed in Mbps. + int port; // Port number. + int maxComms; // Maximum number of comms we can create +} ncclNetProperties_v4_t; typedef struct { // Name of the network (mainly for logs) @@ -75,10 +179,6 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; -typedef ncclNet_v4_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4 - typedef struct { // Name of the collective network (mainly for logs) const char* name; @@ -117,8 +217,4 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v4_t; -typedef ncclCollNet_v4_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4 - #endif // end include guard diff --git a/src/include/net.h b/src/include/net.h index 10a2d85432..0cc50678b5 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -1,6 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -10,10 +9,14 @@ #include "nccl.h" #include "nccl_net.h" +#include "checks.h" extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; +ncclResult_t ncclNetInit(); +int ncclNetVersion(); + // Translation to external API static const char* ncclNetName() { return ncclNet->name; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } @@ -23,60 +26,16 @@ static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCC static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return 
ncclSuccess; } +static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; } static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } // Test whether the current GPU support GPU Direct RDMA. -#define GPU_BUF_SIZE (2*1024*1024) -static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { - int netDevs; - NCCLCHECK(ncclNetDevices(&netDevs)); - *gdrSupport = 0; - for (int dev=0; devgetProperties(dev, &props)); - if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - *gdrSupport = 1; - break; -#endif - - // Allocate memory on the GPU and try to register it on the NIC. 
- void *lComm = NULL, *sComm = NULL, *rComm = NULL; - ncclNetHandle_t handle; - void* gpuPtr = NULL; - void* mHandle = NULL; - ncclResult_t ret; - ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1); - NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2); - NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3); - CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); - if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(ncclNetDeregMr(sComm, mHandle)); - NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(ncclNetDeregMr(rComm, mHandle)); - *gdrSupport = 1; - } - ncclDebugNoWarn = 0; - CUDACHECK(hipFree(gpuPtr)); -cleanup4: - NCCLCHECK(ncclNetCloseRecv(rComm)); -cleanup3: - NCCLCHECK(ncclNetCloseSend(sComm)); -cleanup2: - NCCLCHECK(ncclNetCloseListen(lComm)); -cleanup1: - break; - } - return ncclSuccess; -} +ncclResult_t ncclGpuGdrSupport(int* gdrSupport); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 21ee82eaf8..29731dd835 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,59 +9,13 @@ #include "nccl.h" -// The NVML library doesn't appear to be thread safe -#include -extern pthread_mutex_t nvmlLock; -#define NVMLLOCK() pthread_mutex_lock(&nvmlLock) -#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock) +//#define NCCL_NVML_DIRECT 1 +#ifndef NCCL_NVML_DIRECT +#define NCCL_NVML_DIRECT 0 +#endif -#define NVMLLOCKCALL(cmd, ret) do { \ - NVMLLOCK(); \ - ret = cmd; \ - NVMLUNLOCK(); \ -} while(false) - -#define NVMLCHECK(cmd) do { \ - nvmlReturn_t e; \ - NVMLLOCKCALL(cmd, e); \ - if( e != NVML_SUCCESS ) { \ - WARN("NVML failure '%s'", nvmlErrorString(e)); \ - return ncclSystemError; \ - } \ -} while(false) - -//#define NVML_DIRECT 1 -#ifdef NVML_DIRECT +#if NCCL_NVML_DIRECT #include "nvml.h" - -static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; } -static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; } -static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; } -static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { - NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { - NVMLCHECK(nvmlDeviceGetIndex(device, index)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { - NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { - NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, 
unsigned int *capResult) { - NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { - NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor)); - return ncclSuccess; -} #else // Dynamically handle dependencies on NVML @@ -129,21 +83,56 @@ typedef struct nvmlPciInfo_st unsigned int reserved2; unsigned int reserved3; } nvmlPciInfo_t; + +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + NVML_P2P_CAPS_INDEX_UNKNOWN +} nvmlGpuP2PCapsIndex_t; + /* End of nvml.h */ +#endif // NCCL_NVML_DIRECT -ncclResult_t wrapNvmlSymbols(void); +constexpr int ncclNvmlMaxDevices = 32; +struct ncclNvmlDeviceInfo { + nvmlDevice_t handle; + int computeCapabilityMajor, computeCapabilityMinor; +}; +struct ncclNvmlDevicePairInfo { + nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; +}; +extern int ncclNvmlDeviceCount; +extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; +extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; -ncclResult_t wrapNvmlInit(void); -ncclResult_t wrapNvmlShutdown(void); -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); -ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); -ncclResult_t 
wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); -ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); -ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); -ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); - -#endif // NVML_DIRECT +// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. +// Outsiders need only call it if they want to inspect the ncclNvml global +// tables above. +ncclResult_t ncclNvmlEnsureInitialized(); +ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); +ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); +ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); +ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); +ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); #endif // End include guard diff --git a/src/include/p2p.h b/src/include/p2p.h index 2519873c20..7430a722e7 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -12,16 +12,18 @@ struct ncclP2Pinfo { void* buff; ssize_t nbytes; + uint64_t opCount; }; typedef ncclRecyclableList ncclP2Plist; -static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes) { +static ncclResult_t 
ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes, uint64_t opCount) { if (p2p == NULL) p2p = new ncclP2Plist(); struct ncclP2Pinfo* next; NCCLCHECK(p2p->getNewElem(&next)); next->buff = buff; next->nbytes = nBytes; + next->opCount = opCount; return ncclSuccess; } #endif diff --git a/src/include/param.h b/src/include/param.h index ca992d71c8..ca243ca6fb 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,77 +8,24 @@ #ifndef NCCL_PARAM_H_ #define NCCL_PARAM_H_ -#include -#include -#include -#include -#include +#include -static const char* userHomeDir() { - struct passwd *pwUser = getpwuid(getuid()); - return pwUser == NULL ? 
NULL : pwUser->pw_dir; -} +const char* userHomeDir(); +void setEnvFile(const char* fileName); +void initEnv(); -static void setEnvFile(const char* fileName) { - FILE * file = fopen(fileName, "r"); - if (file == NULL) return; +void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); - char *line = NULL; - char envVar[1024]; - char envValue[1024]; - size_t n = 0; - ssize_t read; - while ((read = getline(&line, &n, file)) != -1) { - if (line[read-1] == '\n') line[read-1] = '\0'; - int s=0; // Env Var Size - while (line[s] != '\0' && line[s] != '=') s++; - if (line[s] == '\0') continue; - strncpy(envVar, line, std::min(1023,s)); - envVar[s] = '\0'; - s++; - strncpy(envValue, line+s, 1023); - envValue[1023]='\0'; - setenv(envVar, envValue, 0); - } - if (line) free(line); - fclose(file); -} - -static void initEnv() { - char confFilePath[1024]; - const char * userDir = userHomeDir(); - if (userDir) { - sprintf(confFilePath, "%s/.nccl.conf", userDir); - setEnvFile(confFilePath); - } - sprintf(confFilePath, "/etc/nccl.conf"); - setEnvFile(confFilePath); -} - - -#define NCCL_PARAM(name, env, default_value) \ -pthread_mutex_t ncclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \ -int64_t ncclParam##name() { \ - static_assert(default_value != -1LL, "default value cannot be -1"); \ - static int64_t value = -1LL; \ - pthread_mutex_lock(&ncclParamMutex##name); \ - if (value == -1LL) { \ - value = default_value; \ - char* str = getenv("NCCL_" env); \ - if (str && strlen(str) > 0) { \ - errno = 0; \ - int64_t v = strtoll(str, NULL, 0); \ - if (errno) { \ - INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \ - } else { \ - value = v; \ - INFO(NCCL_ALL,"%s set by environment to %lu.", "NCCL_" env, value); \ - } \ +#define NCCL_PARAM(name, env, deftVal) \ + int64_t ncclParam##name() { \ + constexpr int64_t uninitialized = INT64_MIN; \ + static_assert(deftVal != uninitialized, "default value cannot be the 
uninitialized value."); \ + static int64_t cache = uninitialized; \ + if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ + ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ } \ - } \ - pthread_mutex_unlock(&ncclParamMutex##name); \ - return value; \ -} + return cache; \ + } #define RCCL_PARAM(name, env, default_value) \ pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \ diff --git a/src/include/profiler.h b/src/include/profiler.h new file mode 100644 index 0000000000..103af99adf --- /dev/null +++ b/src/include/profiler.h @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +#include "proxy.h" + +enum ncclProxyProfileState { + ncclProxyProfileBegin = 0, + + ncclProxyProfileSendGPUWait = 1, + ncclProxyProfileSendWait = 2, + + ncclProxyProfileRecvWait = 1, + ncclProxyProfileRecvFlushWait = 2, + ncclProxyProfileRecvGPUWait = 3, + + ncclProxyProfileEnd = 4, + + ncclProxyProfileSleep = 8, + ncclProxyProfileWakeup = 9, + + ncclProxyProfileIdle = 16, + ncclProxyProfileActive = 17, + + ncclProxyProfileAppend = 24, + ncclProxyProfileAppendEnd = 25 +}; + +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); +void ncclProfilingDump(); + +#endif diff --git a/src/include/proxy.h b/src/include/proxy.h index 1cae10e533..1cf88d7b1c 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. 
All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,27 +8,47 @@ #ifndef NCCL_PROXY_H_ #define NCCL_PROXY_H_ +#include "devcomm.h" +#include "info.h" +#include "socket.h" #include enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; -typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*); #define NCCL_PROXY_MAX_SUBS MAXCHANNELS static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); -struct ncclProxySubArgs { - struct ncclChannel* channel; - struct ncclConnector* connector; +struct ncclProxyOp { + struct ncclProxyConnection* connection; + int channelId; int nsteps; - ssize_t sendbytes; - ssize_t recvbytes; - int sendChunkSize; - int recvChunkSize; - int delta; + ssize_t nbytes; + int root; + int next; - // Internal state + uint64_t opCount; + int sliceSteps; + int chunkSteps; + int chunkSize; + ncclDataType_t dtype; + ncclRedOp_t redOp; + ncclPattern_t pattern; // uint8_t + uint8_t protocol; + uint16_t connIndex; +}; +static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); + +struct ncclProxySubArgs { + struct ncclProxyConnection* connection; + int channelId; + int nsteps; + ssize_t nbytes; + int peer; + + int groupSize; // Number of consecutive sub operations sharing the same recvComm uint64_t base; uint64_t posted; uint64_t received; @@ -37,67 +57,128 @@ struct ncclProxySubArgs { uint64_t done; uint64_t end; void* requests[NCCL_STEPS]; + void* profilingEvents[NCCL_STEPS]; }; struct ncclProxyArgs { - proxyProgressFunc_t progress; struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; + proxyProgressFunc_t progress; int nsubs; int done; + 
uint64_t opCount; int sliceSteps; int chunkSteps; int chunkSize; - uint64_t opCount; - uint64_t commOpCount; - int protocol; ncclDataType_t dtype; ncclRedOp_t redOp; ncclPattern_t pattern; - int root; + uint8_t protocol; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; int idle; uint64_t hdp_flushed; - uint8_t connIndex; - uint8_t sendIdx; - uint8_t recvIdx; // Element linking - pthread_mutex_t mutex; struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; struct ncclProxyArgs** proxyAppendPtr; }; +#define NCCL_MAX_NETDEVS 128 -struct ncclProxySharedBuffers { +// ProxyOps are used to communicate between main thread and service thread +// Make sure we have enough to store two full rounds of operations on all channels. +// Otherwise we'd be unable to post half of them to free new elements. +#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) +#define NCCL_MAX_LOCAL_RANKS 64 +struct ncclProxyOpsPool { + struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; + volatile int nextOps; + volatile int nextOpsEnd; + volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +struct ncclProxyOps { + ncclProxyOpsPool* pool; + int count; + int freeOp; + int nextOps; + int nextOpsEnd; +}; + +struct ncclProxySharedP2p { + int refcount; int size; char* cudaBuff; char* hostBuff; - struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv - // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS. 
- struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS]; - void* collNetResources; + hipIpcMemHandle_t ipc; + struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv +}; + +struct ncclProxySharedCollNet { + int size; + char* cudaBuff; + char* hostBuff; + struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; + void* resources; +}; + +struct ncclProxyPeer { + struct ncclProxySharedP2p send; + struct ncclProxySharedP2p recv; +}; + +struct ncclSharedNetComms { + void* sendComm[MAXCHANNELS]; + void* recvComm[MAXCHANNELS]; + int sendRefCount[MAXCHANNELS]; + int recvRefCount[MAXCHANNELS]; }; struct ncclProxyPool; -struct ncclProxyState { - pthread_cond_t cond; - pthread_mutex_t opsMutex; - pthread_mutex_t poolMutex; - bool stop; - struct ncclProxySharedBuffers sharedBuffs; - struct ncclProxyArgs* ops; // Running operations, used by proxy thread - struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex - struct ncclProxyArgs* postedOpsEnd; - struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled) - struct ncclProxyArgs* nextOpsEnd; - struct ncclProxyArgs* pool; // Free operations for main thread - struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread - struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex +struct ncclProxyProgressState { + // Used by main threads to send work to progress thread + struct ncclProxyOpsPool* opsPool; + char opsPoolShmSuffix[6]; + pthread_t thread; + bool stop; + struct ncclProxyPeer** localPeers; + struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; + struct ncclProxySharedCollNet collNet; + struct ncclProxyArgs* active; + struct ncclProxyArgs* pool; struct ncclProxyPool* pools; + int nextOps; +}; + +struct ncclProxyState { + // Service thread + pthread_t thread; + struct ncclSocket* listenSock; + int stop; + + // Used by main thread + union 
ncclSocketAddress* peerAddresses; + struct ncclSocket* peerSocks; + struct ncclProxyOps* proxyOps; + void** sharedDevMems; + + // Progress thread + struct ncclProxyProgressState progressState; +}; + +struct ncclProxyConnection { + int send, transport, shared; + int localRank; + struct ncclSocket* sock; + struct ncclTransportComm* tcomm; + struct ncclProxyArgs *proxyAppend; + struct ncclProxyArgs **proxyAppendPtr; + void* transportResources; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); @@ -108,26 +189,25 @@ enum proxyMode { proxyTo = 2 }; -ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks); -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args); -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args); +ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks); +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); +ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp); ncclResult_t ncclProxyStart(struct ncclComm* comm); +ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); ncclResult_t ncclProxyCreate(struct ncclComm* comm); +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn); +enum ncclProxyMsgType { + ncclProxyMsgInit = 1, + ncclProxyMsgSharedInit = 2, + ncclProxyMsgSetup = 3, + ncclProxyMsgConnect = 4, + ncclProxyMsgStart = 5, + ncclProxyMsgClose = 6, + ncclProxyMsgAbort = 7, + ncclProxyMsgStop = 8 +}; + +ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); ncclResult_t ncclProxyDestroy(struct ncclComm* comm); - -ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr); -ncclResult_t ncclProxySharedBuffersGetP2p(struct 
ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr); -ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr); -ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm); - -#include - -// Spin wait until func evaluates to true -template -inline void transportProxyWait(const FUNC& func) { - while (!func()) { - sched_yield(); - } -} - +ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); #endif diff --git a/src/include/shm.h b/src/include/shm.h index 0b93995089..08dc8495fd 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,6 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,65 +7,9 @@ #ifndef NCCL_SHM_H_ #define NCCL_SHM_H_ -#include -#include -#include -#include - -// Change functions behavior to match other SYS functions -static int shm_allocate(int fd, const int shmsize) { - int err = posix_fallocate(fd, 0, shmsize); - if (err) { errno = err; return -1; } - return 0; -} -static int shm_map(int fd, const int shmsize, void** ptr) { - *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - return (*ptr == MAP_FAILED) ? 
-1 : 0; -} - -static ncclResult_t shmSetup(const char* shmname, const int shmsize, int* fd, void** ptr, int create) { - SYSCHECKVAL(shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "shm_open", *fd); - if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate"); - SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap"); - close(*fd); - *fd = -1; - if (create) memset(*ptr, 0, shmsize); - return ncclSuccess; -} - -static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) { - int fd = -1; - void* ptr = MAP_FAILED; - ncclResult_t res = ncclSuccess; - - NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError); - CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError); - CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError); - - *shmPtr = ptr; - return ncclSuccess; -sysError: - WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize); -hipError: - if (fd != -1) close(fd); - if (create) shm_unlink(shmname); - if (ptr != MAP_FAILED) munmap(ptr, shmsize); - *shmPtr = NULL; - return res; -} - -static ncclResult_t shmUnlink(const char* shmname) { - if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink"); - return ncclSuccess; -} - -static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) { - CUDACHECK(hipHostUnregister(shmPtr)); - if (munmap(shmPtr, shmsize) != 0) { - WARN("munmap of shared memory failed"); - return ncclSystemError; - } - return ncclSuccess; -} +#include "nccl.h" +ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create); +ncclResult_t ncclShmUnlink(const char* shmname); +ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize); #endif diff --git a/src/include/socket.h b/src/include/socket.h index 2dbaaa9f36..d72480b6bb 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -1,6 
+1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,14 +7,13 @@ #ifndef NCCL_SOCKET_H_ #define NCCL_SOCKET_H_ +#include "nccl.h" #include #include #include -#include #include -#include -#include -#include "utils.h" +#include +#include #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 @@ -25,443 +23,48 @@ #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) /* Common socket address storage structure for IPv4/IPv6 */ -union socketAddress { +union ncclSocketAddress { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; -/* Format a string representation of a (union socketAddress *) socket address using getnameinfo() - * - * Output: "IPv4/IPv6 address" - */ -static inline const char *socketToString(union socketAddress *addr, char *buf) { - if (buf == NULL || addr == NULL) return NULL; - struct sockaddr *saddr = &addr->sa; - if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } - char host[NI_MAXHOST], service[NI_MAXSERV]; - (void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV); - sprintf(buf, "%s<%s>", host, service); - return buf; -} +enum ncclSocketState { + ncclSocketConnecting = 0, + ncclSocketConnected = 1, + ncclSocketError = 2, + ncclSocketStateNum = 3 +} ; -static inline uint16_t socketToPort(union socketAddress *addr) { - struct sockaddr *saddr = &addr->sa; - return ntohs(saddr->sa_family == AF_INET ? 
addr->sin.sin_port : addr->sin6.sin6_port); -} +struct ncclSocket { + int fd; + union ncclSocketAddress addr; + volatile uint32_t* abortFlag; + int asyncFlag; + enum ncclSocketState state; +}; -/* Allow the user to force the IPv4/IPv6 interface selection */ -static inline int envSocketFamily(void) { - int family = -1; // Family selection is not forced, will use first one found - char* env = getenv("NCCL_SOCKET_FAMILY"); - if (env == NULL) - return family; - - INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env); - - if (strcmp(env, "AF_INET") == 0) - family = AF_INET; // IPv4 - else if (strcmp(env, "AF_INET6") == 0) - family = AF_INET6; // IPv6 - return family; -} - -static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { -#ifdef ENABLE_TRACE - char line[SOCKET_NAME_MAXLEN+1]; -#endif - struct netIf userIfs[MAX_IFS]; - bool searchNot = prefixList && prefixList[0] == '^'; - if (searchNot) prefixList++; - bool searchExact = prefixList && prefixList[0] == '='; - if (searchExact) prefixList++; - int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); - - int found = 0; - struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { - if (interface->ifa_addr == NULL) continue; - - /* We only support IPv4 & IPv6 */ - int family = interface->ifa_addr->sa_family; - if (family != AF_INET && family != AF_INET6) - continue; - - TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString((union socketAddress *)interface->ifa_addr, line)); - - /* Allow the caller to force the socket family type */ - if (sock_family != -1 && family != sock_family) - continue; - - /* We also need to skip IPv6 loopback interfaces */ - if (family == AF_INET6) { - struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr); - if 
(IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue; - } - - // check against user specified interfaces - if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) { - continue; - } - - // Check that this interface has not already been saved - // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link - bool duplicate = false; - for (int i = 0; i < found; i++) { - if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } - } - - if (!duplicate) { - // Store the interface name - strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); - // Store the IP address - int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); - memcpy(addrs+found, interface->ifa_addr, salen); - found++; - } - } - - freeifaddrs(interfaces); - return found; -} - -static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) { - /* Check family first */ - int family = local_if.ifa_addr->sa_family; - if (family != remote->sa.sa_family) { - return false; - } - - if (family == AF_INET) { - struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); - struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask); - struct sockaddr_in& remote_addr = remote->sin; - struct in_addr local_subnet, remote_subnet; - local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; - remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; - return (local_subnet.s_addr ^ remote_subnet.s_addr) ? 
false : true; - } else if (family == AF_INET6) { - struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); - struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); - struct sockaddr_in6& remote_addr = remote->sin6; - struct in6_addr& local_in6 = local_addr->sin6_addr; - struct in6_addr& mask_in6 = mask->sin6_addr; - struct in6_addr& remote_in6 = remote_addr.sin6_addr; - bool same = true; - int len = 16; //IPv6 address is 16 unsigned char - for (int c = 0; c < len; c++) { //Network byte order is big-endian - char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c]; - char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c]; - if (c1 ^ c2) { - same = false; - break; - } - } - // At last, we need to compare scope id - // Two Link-type addresses can have the same subnet address even though they are not in the same scope - // For Global type, this field is 0, so a comparison wouldn't matter - same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); - return same; - } else { - WARN("Net : Unsupported address family type"); - return false; - } -} - -static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { -#ifdef ENABLE_TRACE - char line[SOCKET_NAME_MAXLEN+1]; -#endif - char line_a[SOCKET_NAME_MAXLEN+1]; - int found = 0; - struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && !found; interface = interface->ifa_next) { - if (interface->ifa_addr == NULL) continue; - - /* We only support IPv4 & IPv6 */ - int family = interface->ifa_addr->sa_family; - if (family != AF_INET && family != AF_INET6) - continue; - - // check against user specified interfaces - if (!matchSubnet(*interface, remoteAddr)) { - continue; - } - - // Store the local IP address - int salen = (family == AF_INET) ? 
sizeof(sockaddr_in) : sizeof(sockaddr_in6); - memcpy(localAddrs+found, interface->ifa_addr, salen); - - // Store the interface name - strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); - - TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(localAddrs+found, line), socketToString(remoteAddr, line_a)); - found++; - if (found == maxIfs) break; - } - - if (found == 0) { - WARN("Net : No interface found in the same subnet as remote address %s", socketToString(remoteAddr, line_a)); - } - freeifaddrs(interfaces); - return found; -} - -static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) { - if (!(ip_port_pair && strlen(ip_port_pair) > 1)) { - WARN("Net : string is null"); - return ncclInvalidArgument; - } - - bool ipv6 = ip_port_pair[0] == '['; - /* Construct the sockaddress structure */ - if (!ipv6) { - struct netIf ni; - // parse : string, expect one pair - if (parseStringList(ip_port_pair, &ni, 1) != 1) { - WARN("Net : No valid : pair found"); - return ncclInvalidArgument; - } - - struct addrinfo hints, *p; - int rv; - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - - if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) { - WARN("Net : error encountered when getting address info : %s", gai_strerror(rv)); - return ncclInvalidArgument; - } - - // use the first - if (p->ai_family == AF_INET) { - struct sockaddr_in& sin = ua->sin; - memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in)); - sin.sin_family = AF_INET; // IPv4 - //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address - sin.sin_port = htons(ni.port); // port - } else if (p->ai_family == AF_INET6) { - struct sockaddr_in6& sin6 = ua->sin6; - memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6)); - sin6.sin6_family = AF_INET6; // IPv6 - sin6.sin6_port = htons(ni.port); // port - sin6.sin6_flowinfo = 0; // 
needed by IPv6, but possibly obsolete - sin6.sin6_scope_id = 0; // should be global scope, set to 0 - } else { - WARN("Net : unsupported IP family"); - return ncclInvalidArgument; - } - - freeaddrinfo(p); // all done with this structure - - } else { - int i, j = -1, len = strlen(ip_port_pair); - for (i = 1; i < len; i++) { - if (ip_port_pair[i] == '%') j = i; - if (ip_port_pair[i] == ']') break; - } - if (i == len) { - WARN("Net : No valid [IPv6]:port pair found"); - return ncclInvalidArgument; - } - bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope - - char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ]; - memset(ip_str, '\0', sizeof(ip_str)); - memset(port_str, '\0', sizeof(port_str)); - memset(if_name, '\0', sizeof(if_name)); - strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1); - strncpy(port_str, ip_port_pair+i+2, len-i-1); - int port = atoi(port_str); - if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name - - struct sockaddr_in6& sin6 = ua->sin6; - sin6.sin6_family = AF_INET6; // IPv6 - inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address - sin6.sin6_port = htons(port); // port - sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete - sin6.sin6_scope_id = global_scope ? 
0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope - } - return ncclSuccess; -} - -static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { - static int shownIfName = 0; - int nIfs = 0; - // Allow user to force the INET socket family selection - int sock_family = envSocketFamily(); - // User specified interface - char* env = getenv("NCCL_SOCKET_IFNAME"); - if (env && strlen(env) > 1) { - INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); - // Specified by user : find or fail - if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); - nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - } else { - // Try to automatically pick the right one - // Start with IB - nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - // else see if we can get some hint from COMM ID - if (nIfs == 0) { - char* commId = getenv("NCCL_COMM_ID"); - if (commId && strlen(commId) > 1) { - INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); - // Try to find interface that is in the same subnet as the IP in comm id - union socketAddress idAddr; - GetSocketAddrFromString(&idAddr, commId); - nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); - } - } - // Then look for anything else (but not docker or lo) - if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - // Finally look for docker, then lo. - if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - } - return nIfs; -} - -static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) { - /* IPv4/IPv6 support */ - int family = localAddr->sa.sa_family; - int salen = (family == AF_INET) ? 
sizeof(sockaddr_in) : sizeof(sockaddr_in6); - - /* Create socket and bind it to a port */ - int sockfd = socket(family, SOCK_STREAM, 0); - if (sockfd == -1) { - WARN("Net : Socket creation failed : %s", strerror(errno)); - return ncclSystemError; - } - -#if defined(RCCL_IB_TEST) - static int port = 23456; - localAddr->sin.sin_port = htons(port++); -#endif - - if (socketToPort(localAddr)) { - // Port is forced by env. Make sure we get the port. - int opt = 1; -#if defined(SO_REUSEPORT) - SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); -#else - SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); -#endif - } - - // localAddr port should be 0 (Any port) - SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind"); - - /* Get the assigned Port */ - socklen_t size = salen; - SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname"); - -#ifdef ENABLE_TRACE - char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(localAddr, line)); -#endif - - /* Put the socket in listen mode - * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn - */ - SYSCHECK(listen(sockfd, 16384), "listen"); - *fd = sockfd; - return ncclSuccess; -} - -static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { - char line[SOCKET_NAME_MAXLEN+1]; - /* IPv4/IPv6 support */ - int family = remoteAddr->sa.sa_family; - if (family != AF_INET && family != AF_INET6) { - WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", - socketToString(remoteAddr, line), family, AF_INET, AF_INET6); - return ncclInternalError; - } - int salen = (family == AF_INET) ? 
sizeof(sockaddr_in) : sizeof(sockaddr_in6); - - /* Connect to a hostname / port */ - *fd = socket(family, SOCK_STREAM, 0); - if (*fd == -1) { - WARN("Net : Socket creation failed : %s", strerror(errno)); - return ncclSystemError; - } - - const int one = 1; - SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - - /* const int bufsize = 128*1024; - SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); - SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/ - - TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(remoteAddr, line)); - - int ret; - int timedout_retries = 0; - int refused_retries = 0; -retry: - SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret); - if (ret == 0) return ncclSuccess; - if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { - if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || - (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { - if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); - usleep(SLEEP_INT); - goto retry; - } - } - WARN("Net : Connect to %s failed : %s", socketToString(remoteAddr, line), strerror(errno)); - return ncclSystemError; -} +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); +ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); +int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); +int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); +// Create a listening socket. sock->addr can be pre-filled with IP & port info. 
sock->fd is set after a successful call +ncclResult_t ncclSocketListen(struct ncclSocket* sock); +// Connect to sock->addr. sock->fd is set after a successful call. +ncclResult_t ncclSocketConnect(struct ncclSocket* sock); +// Return socket connection state. +ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state); +// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. +ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket); #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 -static ncclResult_t socketProgressOpt(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset, int block) { - int bytes = 0; - char* data = (char*)ptr; - char line[SOCKET_NAME_MAXLEN+1]; - do { - if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); - if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 
0 : MSG_DONTWAIT); - if (op == NCCL_SOCKET_RECV && bytes == 0) { - WARN("Net : Connection closed by remote peer %s", socketToString(addr, line)); - return ncclSystemError; - } - if (bytes == -1) { - if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - WARN("Net : Call to recv from %s failed : %s", socketToString(addr, line), strerror(errno)); - return ncclSystemError; - } else { - bytes = 0; - } - } - (*offset) += bytes; - } while (bytes > 0 && (*offset) < size); - return ncclSuccess; -} - -static ncclResult_t socketProgress(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) { - return socketProgressOpt(op, fd, addr, ptr, size, offset, 0); -} - -static ncclResult_t socketWait(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) { - while (*offset < size) - NCCLCHECK(socketProgressOpt(op, fd, addr, ptr, size, offset, 1)); - return ncclSuccess; -} - -static ncclResult_t socketSend(int fd, union socketAddress *addr, void* ptr, int size) { - int offset = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, fd, addr, ptr, size, &offset)); - return ncclSuccess; -} - -static ncclResult_t socketRecv(int fd, union socketAddress *addr, void* ptr, int size) { - int offset = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, addr, ptr, size, &offset)); - return ncclSuccess; -} +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed); +/* initialize a socket. 
*/ +ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); #endif diff --git a/src/include/timer.h b/src/include/timer.h new file mode 100644 index 0000000000..284fec6e05 --- /dev/null +++ b/src/include/timer.h @@ -0,0 +1,60 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TIMER_H_ +#define NCCL_TIMER_H_ +#if ENABLE_TIMER +#include <unistd.h> +#include <sys/time.h> +#include <x86intrin.h> +static double freq = -1; +static void calibrate() { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t timeCycles = __rdtsc(); + double time = - tv.tv_sec*1E6 - tv.tv_usec; + uint64_t total = 0ULL; + for (int i=0; i<10000; i++) total += __rdtsc(); + gettimeofday(&tv, NULL); + timeCycles = __rdtsc() - timeCycles; + time += tv.tv_sec*1E6 + tv.tv_usec; + freq = timeCycles/time; +} +static inline double gettime() { + if (freq == -1) calibrate(); + return __rdtsc()/freq; +} +static uint64_t counts[8]; +static double times[8]; +static double startTimes[8]; +#define TIME_START(index) do { \ + counts[index]++; \ + startTimes[index] = gettime(); \ +} while (0); + +#define TIME_STOP(index) do { \ + times[index] += gettime() - startTimes[index]; \ +} while (0); + +#define TIME_CANCEL(index) do { \ + counts[index]--; \ +} while (0); + +#define TIME_PRINT(name) do { \ + printf("%s stats", name); \ + for (int i=0; i<8; i++) { \ + if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ + counts[i] = 0; \ + } \ + printf("\n"); \ +} while (0); +#else +#define TIME_START(index) while(0); +#define TIME_STOP(index) while(0); +#define TIME_CANCEL(index) while(0); +#define TIME_PRINT(name) +#endif +#endif diff --git a/src/include/transport.h b/src/include/transport.h index 
bd18ac4b0d..25675cf79c 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -1,5 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,12 +12,14 @@ #include "graph.h" #include "nvmlwrap.h" #include "core.h" -#include "proxy.h" -#define NTRANSPORTS 3 +#define NTRANSPORTS 4 #define TRANSPORT_P2P 0 #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 +#define TRANSPORT_COLLNET 3 + +#include "proxy.h" extern struct ncclTransport ncclTransports[]; @@ -28,12 +31,15 @@ struct ncclComm; struct ncclPeerInfo { int rank; int cudaDev; + int netDev; int gdrSupport; bool hasFineGrain; uint64_t hostHash; uint64_t pidHash; dev_t shmDev; int64_t busId; + struct ncclComm* comm; + int cudaCompCap; }; #define CONNECT_SIZE 128 @@ -44,8 +50,12 @@ struct ncclConnect { struct ncclTransportComm { ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); - ncclResult_t (*free)(void*); - ncclResult_t (*proxy)(struct ncclProxyArgs*); + ncclResult_t (*free)(struct ncclConnector*); + ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels); + ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, 
int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm); + ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*); }; struct ncclTransport { diff --git a/src/include/utils.h b/src/include/utils.h index 739a774e14..f08ff3731d 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,6 +8,7 @@ #define NCCL_UTILS_H_ #include "nccl.h" +#include "checks.h" #include <stdint.h> int ncclCudaCompCap(); @@ -94,6 +95,11 @@ class ncclRecyclableList { return rv; } + T* peakNext() { + if (cursor == NULL || cursor == tail) return NULL; + return &cursor->data; + } + // Recycle the list without freeing the space void recycle() { tail = cursor = head; diff --git a/src/init.cc b/src/init.cc index 60d74c8867..317b58bcd0 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -30,8 +30,8 @@ #include "graph/topo.h" // [RCCL] -#include "clique/CliqueManager.h" -#include +//#include "clique/CliqueManager.h" +//#include // [/RCCL] #define STR2(v) #v @@ -56,93 +56,8 @@ const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", " NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); - -ncclNet_t* ncclNet = NULL; -ncclCollNet_t* ncclCollNet = NULL; - struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {}; -// Returns ncclInternalError if anything fails, causing that network to be ignored. -ncclResult_t initNet(ncclNet_t* net) { - int ndev; - if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; - if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) return ncclSystemError; - return ncclSuccess; -} - -ncclResult_t initCollNet(ncclCollNet_t* collnet) { - int ndev; - if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; - if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) return ncclSystemError; - return ncclSuccess; -} - -ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) { - char ncclNetPluginName[128]; - const char* envPluginName = getenv("NCCL_NET_PLUGIN"); - if (envPluginName && strlen(envPluginName)) { - snprintf(ncclNetPluginName, 128, "librccl-net-%s.so", envPluginName); - INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName); - } else { - sprintf(ncclNetPluginName, "librccl-net.so"); - } - void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); - if (netPluginLib == NULL) { - // dlopen does not guarantee to set errno, but dlerror only gives us a - // string, so checking errno doesn't hurt to try to provide a better - // error message - if (errno == ENOENT) { - 
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); - } - return ncclSuccess; - } - *net = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); - if (*net == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); - if (netPluginLib != NULL) dlclose(netPluginLib); - return ncclSuccess; - } - // Check for CollNet - *collnet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL)); - if (*collnet == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol."); - } - return ncclSuccess; -} - -ncclResult_t initNet() { - // Always initialize bootstrap network - NCCLCHECK(bootstrapNetInit()); - - // Initialize main communication network - ncclNet_t* nets[3] = { NULL, &ncclNetIb, &ncclNetSocket }; - ncclCollNet_t* collNets[3] = { NULL, NULL, NULL }; - NCCLCHECK(initNetPlugin(nets+0, collNets+0)); - char* netName = getenv("NCCL_NET"); - - for (int i=0; i<3; i++) { - if (nets[i] == NULL) continue; - if (netName && strcmp(netName, nets[i]->name) != 0) continue; - // net plugin is already initialized - if (initNet(nets[i]) != ncclSuccess) continue; - ncclNet = nets[i]; - if (collNets[i] && initCollNet(collNets[i]) == ncclSuccess) { - ncclCollNet = collNets[i]; - } - break; - } - - if (ncclNet == NULL) { - WARN("Error: network %s not found.", netName ? 
netName : ""); - return ncclInvalidUsage; - } - return ncclSuccess; -} - // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -156,7 +71,8 @@ ncclResult_t initGdrCopy() { return ncclSuccess; } -NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); + +NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; @@ -168,7 +84,9 @@ static ncclResult_t ncclInit() { initEnv(); initGdrCopy(); maxLocalSizeBytes = ncclKernMaxLocalSize(); - NCCLCHECK(initNet()); + int carveout = ncclParamL1SharedMemoryCarveout(); + if (carveout) ncclKernSetSharedMemoryCarveout(carveout); + NCCLCHECK(ncclNetInit()); INFO(NCCL_INIT, "Using network %s", ncclNetName()); initialized = true; } @@ -208,7 +126,7 @@ void *ncclCommThreadMain(void *arg) { ncclComm_t comm = (ncclComm_t)arg; int head = comm->hostDevComm.collTraceHead; #define MAX_NAME_LENGTH 64 - char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+1)); + char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2)); for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) { for (int al = 0; al < NCCL_NUM_ALGORITHMS; al++) { for (int type = 0; type < ncclNumTypes; type++) { @@ -228,6 +146,8 @@ void *ncclCommThreadMain(void *arg) { } char* line = func_names+MAX_NAME_LENGTH*FUNC_INDEX_P2P; sprintf(line, "SendRecvRingSimpleSum_i8"); + line += MAX_NAME_LENGTH; + sprintf(line, "AllToAllPivotRingSimpleSum_i8"); do { int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS; int count; @@ -257,40 +177,43 @@ void *ncclCommThreadMain(void *arg) { (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, fIdx, td->data_0, td->opCount, td->data_1); } else { - sprintf(line, "## [%12.6f] [%02d:%02d] %06lx", - (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, fIdx == FUNC_INDEX_P2P ? 
(td->opCount + 0x100000): td->opCount); + if (fIdx == FUNC_INDEX_P2P || type == ncclCollTraceP2pElemType) + sprintf(line, "## [%12.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]); + else + sprintf(line, "## [%12.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount); offset = strlen(line); - switch (type) { - case ncclCollTraceKernelLaunchType: - sprintf(line+offset, " KL HWID %8x %s ", - td->data_0, func_names+MAX_NAME_LENGTH*fIdx); - offset = strlen(line); - if (fIdx > FUNC_INDEX_P2P) - sprintf(line+offset, "ERROR bad function index %d", fIdx); - else if (fIdx == FUNC_INDEX_P2P) - sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks); - else - sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks); - break; - case ncclCollTraceCollEndType: - sprintf(line+offset, " CE %s ", func_names+MAX_NAME_LENGTH*fIdx); - offset = strlen(line); - if (fIdx > FUNC_INDEX_P2P) - sprintf(line+offset, "ERROR bad function index %d", fIdx); - else if (fIdx == FUNC_INDEX_P2P) - sprintf(line+offset, "nt %d dt %d busId %lx nRanks %d", td->p2p.nThreads, td->p2p.delta, comm->busId, comm->nRanks); - else - sprintf(line+offset, "nt %d bi %d nc %d busId %lx nRanks %d", td->coll.nThreads, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks); - break; - case ncclCollTraceKernelEndType: - sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks); - break; - case ncclCollTraceAbortType: - sprintf(line+offset, " Abort"); - break; - default: - sprintf(line+offset, " unknown collective trace data type"); - break; + if (type == ncclCollTraceCollElemType) { + sprintf(line+offset, " CE %s nw %d bi %d nc %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx, td->coll.nWarps, td->coll.bid, 
td->coll.nChannels, comm->busId, comm->nRanks); + } else if (type == ncclCollTraceP2pElemType) { + sprintf(line+offset, " PE %s %d -> %d/%d/%d/%d conn/nw/ws/ng %d/%d/%d/%d -> %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx, + td->p2p[0].peer, td->p2p[0].connIndex, td->p2p[0].nWarps, td->p2p[0].warpStart, td->p2p[0].ngroups, + td->p2p[1].connIndex, td->p2p[1].nWarps, td->p2p[1].warpStart, td->p2p[1].ngroups, td->p2p[1].peer, comm->busId, comm->nRanks); + } else { + switch (type&0xf) { + case ncclCollTraceKernelLaunchType: + case ncclCollTraceCollLaunchType: + if ((type&0xf) == ncclCollTraceKernelLaunchType) + sprintf(line+offset, " KL HWID %8x %s", td->data_0, func_names+MAX_NAME_LENGTH*fIdx); + else if ((type&0xf) == ncclCollTraceCollLaunchType) + sprintf(line+offset, " CL %s", func_names+MAX_NAME_LENGTH*fIdx); + offset = strlen(line); + if ((type&0xf0) == ncclCollTraceCollElemType) + sprintf(line+offset, " nw %d bi %d nc %d busId %lx nRanks %d", td->coll.nWarps, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks); + else if ((type&0xf0) == ncclCollTraceP2pElemType) + sprintf(line+offset, " %d -> %d/%d/%d/%d conn/nw/ws/ng %d/%d/%d/%d -> %d busId %lx nRanks %d", + td->p2p[0].peer, td->p2p[0].connIndex, td->p2p[0].nWarps, td->p2p[0].warpStart, td->p2p[0].ngroups, + td->p2p[1].connIndex, td->p2p[1].nWarps, td->p2p[1].warpStart, td->p2p[1].ngroups, td->p2p[1].peer, comm->busId, comm->nRanks); + break; + case ncclCollTraceKernelEndType: + sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks); + break; + case ncclCollTraceAbortType: + sprintf(line+offset, " Abort"); + break; + default: + sprintf(line+offset, " unknown collective trace data type"); + break; + } } } INFO(NCCL_COLL, "%s", line); @@ -311,6 +234,9 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + // First stop all threads before we free anything. 
+ NCCLCHECK(ncclProxyDestroy(comm)); + delete[] comm->userRedOps; free(comm->connectSend); @@ -324,96 +250,38 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->asyncOps); #ifdef ENABLE_PROFILING -#ifdef ENABLE_TIMING_PROFILE - struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)); - CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost)); + struct ncclProf prof; + prof.elems = (struct ncclProfElem*)malloc(sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS); + CUDACHECK(hipMemcpy(prof.elems, comm->hostDevComm.devProf.elems, sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS, hipMemcpyDeviceToHost)); #define VEGA_GPU_RTC_FREQUENCY 2.5E7 if (comm->rank == 0) { - INFO(NCCL_INIT, "# %8s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Rank:Ch", "total", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS"); - INFO(NCCL_INIT, "# %8s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)", "(ms)"); + INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank:Ch", "opCt", "total", " wait", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS"); + INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", " (s)", " (s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)"); } - for (int chan=0; channChannels; chan++) { - INFO(NCCL_INIT, "# [%03d:%02d] %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f", - comm->rank, chan, (double)prof->elems[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - (double)prof->elems[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - (double)prof->elems[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - (double)prof->elems[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - (double)prof->elems[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - (double)prof->elems[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - 
(double)prof->elems[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - (double)prof->elems[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0, - (double)prof->elems[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1000.0); + for (int i = 1; i < PROFILE_NUM_ITEMS; i++) { + int valid = 0; + for (int chan=0; channChannels; chan++) { + struct ncclProfElem *elem = prof.elems+i; + if (elem->elem[chan].opCount == 0) + continue; + valid++; + INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f", + comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY, + (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY, + (elem->elem[chan].send_cycle) ? (double)elem->elem[chan].send_byte/((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (elem->elem[chan].recvReduceSend_cycle) ? (double)elem->elem[chan].recvReduceSend_byte/((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (elem->elem[chan].directRecvReduceCopySend_cycle) ? (double)elem->elem[chan].directRecvReduceCopySend_byte/((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (elem->elem[chan].directRecvCopySend_cycle) ? (double)elem->elem[chan].directRecvCopySend_byte/((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (elem->elem[chan].directRecv_cycle) ? (double)elem->elem[chan].directRecv_byte/((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (elem->elem[chan].copySend_cycle) ? (double)elem->elem[chan].copySend_byte/((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (elem->elem[chan].recv_cycle) ? (double)elem->elem[chan].recv_byte/((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (elem->elem[chan].recvCopySend_cycle) ? 
(double)elem->elem[chan].recvCopySend_byte/((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0); + } + if (valid == 0) + break; } - free(prof); - CUDACHECK(hipFree(comm->hostDevComm.devProf)); - - for (int channel=0; channelnChannels, comm->p2pnChannels); channel++) { - if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %7.3f ms (%ld bytes %d measurements)", - comm->rank, channel, (float)comm->channels[channel].bw_cumulative, - comm->channels[channel].send_byte, comm->channels[channel].bw_count); - if (comm->channels[channel].recv_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Recv %7.3f ms (%ld bytes %d measurements)", - comm->rank, channel, (float)comm->channels[channel].bw_cumulative, - comm->channels[channel].recv_byte, comm->channels[channel].bw_count); - } -#else - struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)); - CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost)); - uint64_t total_cycle = 0, wait_cycle = 0, wait_send_cycle = 0, wait_recv_cycle = 0, send_cycle = 0, directSend_cycle = 0, recv_cycle = 0, \ - directRecv_cycle = 0, copySend_cycle = 0, directCopySend_cycle = 0, recvCopySend_cycle = 0, directRecvCopySend_cycle = 0, \ - recvReduceCopy_cycle = 0, recvReduceSend_cycle = 0, recvReduceCopySend_cycle = 0, directRecvReduceCopySend_cycle = 0, \ - send_byte = 0, directSend_byte = 0, recv_byte = 0, directRecv_byte = 0, copySend_byte = 0, directCopySend_byte = 0, \ - recvCopySend_byte = 0, directRecvCopySend_byte = 0, recvReduceCopy_byte = 0, recvReduceSend_byte = 0, \ - recvReduceCopySend_byte = 0, directRecvReduceCopySend_byte = 0; - for (int chan=0; channChannels; chan++) { - total_cycle += prof->elems[chan].total_cycle; - wait_cycle += prof->elems[chan].wait_cycle; - wait_send_cycle += prof->elems[chan].wait_send_cycle; - wait_recv_cycle += prof->elems[chan].wait_recv_cycle; - send_cycle += prof->elems[chan].send_cycle; - 
directSend_cycle += prof->elems[chan].directSend_cycle; - recv_cycle += prof->elems[chan].recv_cycle; - directRecv_cycle += prof->elems[chan].directRecv_cycle; - copySend_cycle += prof->elems[chan].copySend_cycle; - directCopySend_cycle += prof->elems[chan].directCopySend_cycle; - recvCopySend_cycle += prof->elems[chan].recvCopySend_cycle; - directRecvCopySend_cycle += prof->elems[chan].directRecvCopySend_cycle; - recvReduceCopy_cycle += prof->elems[chan].recvReduceCopy_cycle; - recvReduceSend_cycle += prof->elems[chan].recvReduceSend_cycle; - recvReduceCopySend_cycle += prof->elems[chan].recvReduceCopySend_cycle; - directRecvReduceCopySend_cycle += prof->elems[chan].directRecvReduceCopySend_cycle; - send_byte += prof->elems[chan].send_byte; - directSend_byte += prof->elems[chan].directSend_byte; - recv_byte += prof->elems[chan].recv_byte; - directRecv_byte += prof->elems[chan].directRecv_byte; - copySend_byte += prof->elems[chan].copySend_byte; - directCopySend_byte += prof->elems[chan].directCopySend_byte; - recvCopySend_byte += prof->elems[chan].recvCopySend_byte; - directRecvCopySend_byte += prof->elems[chan].directRecvCopySend_byte; - recvReduceCopy_byte += prof->elems[chan].recvReduceCopy_byte; - recvReduceSend_byte += prof->elems[chan].recvReduceSend_byte; - recvReduceCopySend_byte += prof->elems[chan].recvReduceCopySend_byte; - directRecvReduceCopySend_byte += prof->elems[chan].directRecvReduceCopySend_byte; - } - #define VEGA_GPU_RTC_FREQUENCY 2.5E7 - if (comm->rank == 0) { - INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", " wait", "w_send", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS"); - INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)"); - } - INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f", - 
comm->rank, (double)total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, - (double)wait_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, - (double)wait_send_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, - (double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, - (send_cycle) ? (double)send_byte*comm->nChannels/((double)send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (recvReduceSend_cycle) ? (double)recvReduceSend_byte*comm->nChannels/((double)recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (directRecvReduceCopySend_cycle) ? (double)directRecvReduceCopySend_byte*comm->nChannels/((double)directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (directRecvCopySend_cycle) ? (double)directRecvCopySend_byte*comm->nChannels/((double)directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (directRecv_cycle) ? (double)directRecv_byte*comm->nChannels/((double)directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (copySend_cycle) ? (double)copySend_byte*comm->nChannels/((double)copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (recv_cycle) ? (double)recv_byte*comm->nChannels/((double)recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (recvCopySend_cycle) ? 
(double)recvCopySend_byte*comm->nChannels/((double)recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0); - free(prof); - CUDACHECK(hipFree(comm->hostDevComm.devProf)); + free(prof.elems); + CUDACHECK(hipFree(comm->hostDevComm.devProf.elems)); for (int channel=0; channelnChannels, comm->p2pnChannels); channel++) { if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)", @@ -426,7 +294,6 @@ static ncclResult_t commFree(ncclComm_t comm) { comm->channels[channel].recv_byte, comm->channels[channel].bw_count); } #endif -#endif #ifdef ENABLE_COLLTRACE STORE(&comm->hostDevComm.collTraceExit, 1); @@ -437,6 +304,10 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->peerInfo); ncclTopoFree(comm->topo); + for (int n=0; nnNodes; n++) free(comm->nodeRanks[n].localRankToRank); + free(comm->nodeRanks); + free(comm->rankToNode); + free(comm->rankToLocalRank); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); @@ -460,8 +331,16 @@ static ncclResult_t commFree(ncclComm_t comm) { int isLast; NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); if (isLast) { + // Wait for all service threads to be done. 
We could not + // do it earlier because it could have blocked and prevented + // other ranks in the process to call ncclCommDestroy + for (int i=0; iintraRanks; i++) { + void* ret; + if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret); + } free(comm->intraBarrier); free(comm->intraParams); + free(comm->intraThreads); free(comm->intraCudaDevs); free(comm->intraCGMode); free(comm->intraCC); @@ -525,9 +404,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->collOpCount = 0; comm->p2pOpCount = 0; - comm->argsptr = &comm->args; + comm->argsptrs[0] = &comm->devComm; + comm->argsptrs[1] = &comm->args; #ifdef ENABLE_PROFILING - NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, 1)); + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf.elems, PROFILE_NUM_ITEMS)); #endif #ifdef ENABLE_COLLTRACE @@ -546,11 +426,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->asyncOpCount = 0; comm->asyncTotalSize = 0; comm->channelSize = ncclParamAggChannelSize(); - comm->asyncAllocMode = ncclComm::ROUND_ROBIN; + comm->asyncAllocMode = ncclComm::SHORTEST_QUEUE; char* str = getenv("NCCL_AGG_ALLOC_MODE"); if (str) INFO(NCCL_ENV, "NCCL_AGG_ALLOC_MODE set by environment to %s", str); - if (str && strcmp(str, "SHORTEST_QUEUE") == 0) { - comm->asyncAllocMode = ncclComm::SHORTEST_QUEUE; + if (str && strcmp(str, "ROUND_ROBIN") == 0) { + comm->asyncAllocMode = ncclComm::ROUND_ROBIN; } CUDACHECK(hipDriverGetVersion(&comm->driverVersion)); @@ -577,13 +457,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks)); NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks)); - // Create a map between global rank and intra-node rank - NCCLCHECK(ncclCalloc(&comm->rankToIntraNodeRank, comm->nRanks)); - memset(comm->rankToIntraNodeRank, -1, comm->nRanks*sizeof(comm->rankToIntraNodeRank[0])); - // Mark channels as non initialized. 
for (int c=0; cchannels[c].id = -1; + CUDACHECK(hipDeviceGetAttribute(&comm->WarpSize, hipDeviceAttributeWarpSize, comm->cudaDev)); *comret = comm; return ncclSuccess; } @@ -652,6 +529,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->gdrSupport = 0; } + info->comm = comm; + info->cudaCompCap = ncclCudaCompCap(); return ncclSuccess; } @@ -681,7 +560,7 @@ void* waitForNonNullPtr(void* p) { ncclResult_t initParams(struct ncclComm* comm) { hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank; - params->args = (void **)&comm->argsptr; + params->args = (void **)&comm->argsptrs; params->stream = NULL; params->sharedMem = 0; params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; @@ -703,6 +582,7 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st bar[0] = bar[1] = 0; comm->intraBarrier = bar; NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); + NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks)); NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); int* CGMode; NCCLCHECK(ncclCalloc(&CGMode, 1)); @@ -715,11 +595,13 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st } else { comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams); + comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads); comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs); comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); } comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; + comm->intraThreads[comm->intraRank] = comm->proxyState.thread; NCCLCHECK(initParams(comm)); int cgMdLaunch = 1; @@ -777,10 +659,9 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { return ncclSuccess; } -NCCL_PARAM(CrossNic, "CROSS_NIC", 2); 
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); -NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); +NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0); static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { // We use 2 AllGathers @@ -793,75 +674,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); // [RCCL] Collect the PID of the root int rootPid; - NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap, &rootPid)); + NCCLCHECK(bootstrapInit(commId, comm)); // [/RCCL] // AllGather1 - begin - struct { - struct ncclPeerInfo peerInfo; - struct ncclComm* comm; - int cudaCompCap; - } *allGather1Data; - - NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); - allGather1Data[rank].comm = comm; - allGather1Data[rank].cudaCompCap = ncclCudaCompCap(); - struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo; - NCCLCHECK(fillInfo(comm, myInfo, commHash)); - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); - NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root + NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, commHash)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo))); + for (int i = 0; i < nranks; i++) { - memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); - if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { - WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId); + if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device 
%lx", rank, i, comm->peerInfo[rank].busId); return ncclInvalidUsage; } } - // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs - int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; - int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0; - int myCompCap = allGather1Data[rank].cudaCompCap; - int minCompCap = myCompCap, maxCompCap = myCompCap; - for (int i = 0; i < nranks; i++) { - if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) { - // Rank is on same node - if (intraNodeRanks == 0) intraNodeRank0 = i; - if (i == rank) intraNodeRank = intraNodeRanks; - comm->intraNodeGlobalRanks[intraNodeRanks] = i; - comm->rankToIntraNodeRank[i] = intraNodeRanks; - intraNodeRanks++; - if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) { - // Rank is in same process - if (intraProcRanks == 0) intraProcRank0 = i; - if (i == rank) intraProcRank = intraProcRanks; - intraProcRanks++; - } - } - minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap); - maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap); - } - TRACE(NCCL_INIT,"hostHash[%d] %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, intraNodeRank, intraNodeRanks, intraNodeRank0); - TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", - rank, allGather1Data[rank].peerInfo.pidHash, intraProcRank, intraProcRanks, intraProcRank0); - if (intraProcRank == -1 || intraProcRank0 == -1 || allGather1Data[intraProcRank0].comm == NULL) { - WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash, - intraProcRank, intraProcRanks, intraProcRank0); - return ncclInternalError; - } - if (intraNodeRank == -1 || intraNodeRank0 == -1 || intraNodeRanks == 0) 
{ - WARN("Failed to determine intra node ranks rank %d hostHash %lx pidHash %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash, - intraNodeRank, intraNodeRanks, intraNodeRank0); - return ncclInternalError; - } - struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm; - uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash; - comm->intraNodeRank = intraNodeRank; - // AllGather1 - end // Topo detection / System graph creation @@ -884,11 +711,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Print final topology NCCLCHECK(ncclTopoPrint(comm->topo)); + // Set Affinity to a CPU local the our GPU, so that all memory we allocate + // on the host is local. + NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity)); + cpu_set_t affinitySave; + if (CPU_COUNT(&comm->cpuAffinity)) { + sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + } + ncclResult_t ret; + + // Launch proxy service thread + NCCLCHECK(ncclProxyCreate(comm)); + // Get rings and trees struct ncclTopoGraph ringGraph; ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; - ringGraph.crossNic = ncclParamCrossNic(); ringGraph.collNet = 0; ringGraph.minChannels = 1; ringGraph.maxChannels = MAXCHANNELS/2; @@ -898,7 +737,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclTopoGraph treeGraph; treeGraph.id = 1; treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; - treeGraph.crossNic = ncclParamCrossNic(); treeGraph.collNet = 0; treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 
1 : ringGraph.nChannels; treeGraph.maxChannels = ringGraph.nChannels; @@ -909,39 +747,36 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm collNetGraph.id = 2; collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; - collNetGraph.crossNic = ncclParamCrossNic(); - collNetGraph.minChannels = 1; - collNetGraph.maxChannels = ringGraph.nChannels; + collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); - bool allXgmi = true; + bool allXgmi = true, hasPeerAccess = true; + // Check that all the GPUs have peer access to one another and are XGMI connected + for (int i = 0; i < nranks && hasPeerAccess; i++) { + int cudaDev1 = comm->peerInfo[i].cudaDev; + for (int j = 0; j < nranks; j++) { + if (i == j) continue; + int cudaDev2 = comm->peerInfo[j].cudaDev; + int p2p; + if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p) + { + hasPeerAccess = false; + break; + } + + bool isXGMI; + // Limit to single intermediate GPU for enabling clique + NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1)); + allXgmi &= isXGMI; + } + } + +#if 0 { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED; if (comm->localRanks == comm->nRanks && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) { - // Check that all the GPUs have peer access to one another and are XGMI connected - bool hasPeerAccess = true; - for (int i = 0; i < nranks && hasPeerAccess; i++) - { - int cudaDev1 = allGather1Data[i].peerInfo.cudaDev; - for (int j = 0; j < nranks; j++) - { - if (i == j) continue; - int cudaDev2 = allGather1Data[j].peerInfo.cudaDev; - int p2p; - if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p) - { - hasPeerAccess = false; - break; - } - - bool isXGMI; - // Limit to 
single intermediate GPU for enabling clique - NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1)); - allXgmi &= isXGMI; - } - } if (hasPeerAccess) { if (intraProcRanks == nranks) @@ -960,6 +795,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode); NCCLCHECK(comm->cliqueManager->Init(commId, rootPid)); } // [/RCCL] +#endif if (comm->rank == ncclParamGraphDumpFileRank()) { struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; @@ -967,11 +803,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } // Determine local CollNet support before all-gather - if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1; - if (intraNodeRanks > 8) { - if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node"); - comm->collNetSupport = 0; + if (collNetSupport()) { + char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); + if (collNetEnable != NULL) { + INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); + if (strcmp(collNetEnable, "1") == 0) { + comm->collNetSupport = 1; + } + } } + if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0; if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) { if (rcclParamP2pNetDisable() == 0) { @@ -993,6 +834,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm }; struct { + int netDev; int collNetSupport; int nc; struct ncclGraphInfo tree; @@ -1004,7 +846,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); int idx; - NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx)); + NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx)); allGather3Data[rank].nc = 2; if 
(comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi) allGather3Data[rank].nc = 4; @@ -1018,6 +860,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev)); allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; @@ -1052,19 +895,50 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int *nodesFirstRank, *nodesTreePatterns; NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks)); NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks)); - for (int i=0; inNodes; n++) { - if (nodesFirstRank[n] == firstRank) node = n; - } - if (node == -1) { - node = comm->nNodes++; + NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks)); + for (int r=0; rnNodes && nodesFirstRank[node] != firstRank; node++); + if (node == comm->nNodes) { + comm->nNodes++; nodesFirstRank[node] = firstRank; // Record tree pattern of each node as they can be different depending on sm arch - nodesTreePatterns[node] = allGather3Data[i].tree.pattern; + nodesTreePatterns[node] = allGather3Data[r].tree.pattern; } - if (i == comm->rank) comm->node = node; + comm->rankToNode[r] = node; + } + // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node + NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes)); + NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks)); + for (int r=0; rnRanks; r++) { + int node = comm->rankToNode[r]; + comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks; + comm->nodeRanks[node].localRanks++; + } + // Allocate ranks arrays for each node + for (int n=0; nnNodes; n++) { + 
NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks)); + comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks); + comm->nodeRanks[n].localRanks = 0; + } + // And fill the ranks arrays + for (int r=0; rnRanks; r++) { + int node = comm->rankToNode[r]; + comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r; + } + comm->node = comm->rankToNode[rank]; + comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank; + comm->localRank = comm->rankToLocalRank[rank]; + comm->localRanks = comm->nodeRanks[comm->node].localRanks; + + TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); + if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { + WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, + comm->localRank, comm->localRanks, comm->localRankToRank[0]); + return ncclInternalError; } int nChannelsOrig = comm->nChannels; @@ -1072,6 +946,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); int nc = allGather3Data[0].nc; for (int i=0; ipeerInfo[i].netDev = allGather3Data[i].netDev; allTopoRanks[i] = &allGather3Data[i].topoRanks; nc = std::min(allGather3Data[i].nc, nc); // Make sure we align all ranks so that the tuning is consistent across ranks @@ -1079,20 +954,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, 
treeGraph.speedInter); - treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); - treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter); + treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); + treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter); ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); - ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); - ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter); + ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); + ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter); collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); - collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); - collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); + collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); + collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); 
comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled; } @@ -1106,12 +981,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int i=0; inChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } - // Determine CollNet support after all-gather now that we know nNodes - int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); - if (comm->nNodes < collNetNodeThreshold) { - if (comm->collNetSupport == 1) + // Determine CollNet support after all-gather now that we know nNodes and each node localRanks + if (comm->collNetSupport == 1) { + int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); + if (comm->nNodes < collNetNodeThreshold) { INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); - comm->collNetSupport = 0; + comm->collNetSupport = 0; + } + for (int n=0; nnNodes; n++) { + if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) { + WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1); + comm->collNetSupport = 0; + break; + } + } } int *rings; @@ -1121,7 +1004,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); - free(allGather1Data); free(allGather3Data); // AllGather3 - end @@ -1140,16 +1022,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s comm %p nRanks %02d busId %lx", line, comm, comm->nRanks, comm->busId); - // Set Affinity to a CPU local the our GPU, so that all memory we allocate - // on the host is local. 
- NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity)); - cpu_set_t affinitySave; - if (CPU_COUNT(&comm->cpuAffinity)) { - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - } - ncclResult_t ret; - NCCLCHECK(computeBuffSizes(comm)); // Connect with prev/next for each ring @@ -1186,7 +1058,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Check if we can setup CollNet if (comm->collNetSupport > 0) { int collNetSetupFail = 0; - int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P}; + int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P}; // Find all head ranks int nHeads = collNetGraph.nChannels; int *heads; @@ -1200,7 +1072,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int h=0; hintraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int))); + comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int))); for (int i=0; ilocalRanks; i++) { if (highestTypes[i] > comm->intraHighestTransportType) comm->intraHighestTransportType = highestTypes[i]; @@ -1245,7 +1117,15 @@ collnet_cleanup: TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations - NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + do { + int myCompCap = comm->peerInfo[rank].cudaCompCap; + int minCompCap = myCompCap, maxCompCap = myCompCap; + for (int i = 0; i < nranks; i++) { + minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap); + maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap); + } + NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + } while(0); // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); @@ -1260,28 +1140,67 @@ collnet_cleanup: int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks; for (int c=0; cp2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector comm->connectRecv[peer] |= (1<nRanks - (comm->rank-peer)) % comm->nRanks; for (int c=0; cp2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1<rank, &proxyConn)); + 
NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0)); + + // Then to remote ones when using PXN + if (ncclPxnDisable() == 0) { + int nranks; + int* pxnPeers; + NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks)); + for (int r=0; rp2pnChannels, sizeof(int), NULL, 0)); + } + free(pxnPeers); + } + + do { + // Compute intra-process ranks + int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; + for (int i = 0; i < nranks; i++) { + if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) + && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { + // Rank is in same process + if (intraProcRanks == 0) intraProcRank0 = i; + if (i == rank) intraProcRank = intraProcRanks; + intraProcRanks++; + } + } + TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", + rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); + if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) { + WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, + intraProcRank, intraProcRanks, intraProcRank0); + return ncclInternalError; + } + NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm)); + } while(0); /* Local intra-node barrier */ - NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->intraNodeGlobalRanks, intraNodeRank, intraNodeRanks, (int)intraNodeRank0pidHash)); + NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); - if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm)); + // Unlink proxy shm to make sure it will be properly cleaned up. + NCCLCHECK(ncclProxyShmUnlink(comm)); // We should have allocated all buffers, collective fifos, ... 
we can // restore the affinity. @@ -1301,15 +1220,16 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId CUDACHECK(hipSetDevice(cudaDev)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) - //if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { - // TRACE(NCCL_INIT, "Setting hipLimitStackSize to %zi", maxLocalSizeBytes); - // CUDACHECKIGNORE(hipDeviceSetLimit(hipLimitStackSize, maxLocalSizeBytes)); - //} + if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { + TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); + //CUDACHECKIGNORE(hipDeviceSetLimit(hipLimitStackSize, maxLocalSizeBytes)); + } + *newcomm = NULL; NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, allocTracker[(*newcomm)->cudaDev].totalAllocSize); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx localSize %ld used %ld bytes - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId, maxLocalSizeBytes, allocTracker[(*newcomm)->cudaDev].totalAllocSize); return ncclSuccess; cleanup: @@ -1397,6 +1317,12 @@ static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) { } static ncclResult_t commDestroy(ncclComm_t comm) { + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } + int savedDevice; #ifdef ENABLE_TRACE int rank = comm->rank; @@ -1411,19 +1337,18 @@ static ncclResult_t commDestroy(ncclComm_t comm) { TRACE(NCCL_INIT, "Destroying comm %p rank %d 
abortFlag %d fatalError %d", comm, comm->rank, LOAD(comm->abortFlag), comm->fatalError); CUDACHECK(hipStreamSynchronize(comm->groupStream)); - NCCLCHECK(ncclProxyDestroy(comm)); + ncclDestroyQueueInfo(comm->enqueueInfo); #if CUDART_VERSION >= 11030 NCCLCHECK(ncclGraphHelperDestroy(comm)); #endif INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed); + NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) CUDACHECK(hipSetDevice(savedDevice)); - TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); - return ncclSuccess; } @@ -1433,19 +1358,17 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); - - // Try and prevent a double free of the comm struct (user error) - if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { - WARN("comm %p has already been destroyed", comm); - return ncclInvalidArgument; - } + int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + int64_t busId = comm->busId; + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); // [RCCL] Delete CliqueManager if it exists - if (comm->cliqueManager) delete comm->cliqueManager; + //if (comm->cliqueManager) delete comm->cliqueManager; // [/RCCL] - return commDestroy(comm); + NCCLCHECK(commDestroy(comm)); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId); + return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); @@ -1454,11 +1377,16 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + int64_t busId = comm->busId; + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", 
comm, rank, nranks, cudaDev, busId); + // Ask anything that might still be running on the device to quit *comm->abortFlag = 1; // do not destroy comm because kernel maybe still running // return commDestroy(comm); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId); return ncclSuccess; } diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index c39b06de6d..538336c873 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,7 +12,7 @@ static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, con hipPointerAttribute_t attr; hipError_t err = hipPointerGetAttributes(&attr, pointer); if (err != hipSuccess || attr.devicePointer == NULL) { - WARN("%s : %s is not a valid pointer", opname, ptrname); + WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); return ncclInvalidArgument; } #if CUDART_VERSION >= 10000 @@ -64,12 +64,9 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { } if (info->comm->checkPointers) { - if (info->coll == ncclFuncSendRecv) { - if (strcmp(info->opName, "Send") == 0) { - NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send")); - } else { - NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv")); - } + if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { + if (info->count >0) + NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); } else { // Check 
CUDA device pointers if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 439712e88f..e1aabac8f4 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -29,6 +29,7 @@ int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int at struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); +struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); @@ -65,7 +66,7 @@ ncclResult_t wrap_ibv_symbols(void) { } } -#define LOAD_SYM(handle, symbol, funcptr) do { \ +#define LOAD_SYM(handle, symbol, funcptr) do { \ cast = (void**)&funcptr; \ tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \ if (tmp == NULL) { \ @@ -75,6 +76,12 @@ ncclResult_t wrap_ibv_symbols(void) { *cast = tmp; \ } while (0) +// Attempt to load a specific symbol version - fail silently +#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ + cast = (void**)&funcptr; \ + *cast = dlvsym(handle, symbol, version); \ + } while (0) + LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list); LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list); 
LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name); @@ -89,6 +96,8 @@ ncclResult_t wrap_ibv_symbols(void) { LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd); LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd); LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr); + // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 + LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr); LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq); LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq); @@ -116,6 +125,7 @@ teardown: ibv_internal_alloc_pd = NULL; ibv_internal_dealloc_pd = NULL; ibv_internal_reg_mr = NULL; + ibv_internal_reg_mr_iova2 = NULL; ibv_internal_dereg_mr = NULL; ibv_internal_create_cq = NULL; ibv_internal_destroy_cq = NULL; @@ -260,6 +270,14 @@ struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t len return ibv_internal_reg_mr(pd, addr, length, access); } +ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { + if (ibv_internal_reg_mr_iova2 == NULL) { + return ncclInternalError; + } + if (ret == NULL) { return ncclSuccess; } // Assume dummy call + IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); +} + ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr"); } diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index e83392dcd5..5db7c6be5c 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -1,219 +1,262 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA 
CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nvmlwrap.h" +#include "checks.h" +#include "debug.h" -#ifndef NVML_DIRECT -#include -#include "core.h" +#include +#include +#include -static enum { nvmlUninitialized, nvmlInitializing, nvmlInitialized, nvmlError } nvmlState = nvmlUninitialized; +int ncclNvmlDeviceCount = 0; +ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; +ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; -static nvmlReturn_t (*nvmlInternalInit)(void); -static nvmlReturn_t (*nvmlInternalShutdown)(void); -static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device); -static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index); -static const char* (*nvmlInternalErrorString)(nvmlReturn_t r); -static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); -static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); -static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); -static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor); - -// Used to make the NVML library calls thread safe -pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER; - -ncclResult_t wrapNvmlSymbols(void) { - if (nvmlState == nvmlInitialized) - return ncclSuccess; - if (nvmlState == nvmlError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) { - // Another thread raced in front of us. Wait for it to be done. 
- while (nvmlState == nvmlInitializing) pthread_yield(); - return (nvmlState == nvmlInitialized) ? ncclSuccess : ncclSystemError; - } - - static void* nvmlhandle = NULL; - void* tmp; - void** cast; - - nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW); - if (!nvmlhandle) { - WARN("Failed to open libnvidia-ml.so.1"); - goto teardown; - } - -#define LOAD_SYM(handle, symbol, funcptr) do { \ - cast = (void**)&funcptr; \ - tmp = dlsym(handle, symbol); \ - if (tmp == NULL) { \ - WARN("dlsym failed on %s - %s", symbol, dlerror());\ - goto teardown; \ - } \ - *cast = tmp; \ - } while (0) - -#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\ - cast = (void**)&funcptr; \ - tmp = dlsym(handle, symbol); \ - if (tmp == NULL) { \ - INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \ - } \ - *cast = tmp; \ - } while (0) - - LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit); - LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex); - LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString); - LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState); - LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo); - LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability); - - nvmlState = nvmlInitialized; - return ncclSuccess; - -teardown: - nvmlInternalInit = NULL; - nvmlInternalShutdown = NULL; - nvmlInternalDeviceGetHandleByPciBusId = NULL; - nvmlInternalDeviceGetIndex = NULL; - nvmlInternalDeviceGetNvLinkState = NULL; - nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL; - nvmlInternalDeviceGetNvLinkCapability = NULL; - - if (nvmlhandle != NULL) 
dlclose(nvmlhandle); - nvmlState = nvmlError; - return ncclSystemError; -} - - -ncclResult_t wrapNvmlInit(void) { - if (nvmlInternalInit == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret = nvmlInternalInit(); - if (ret != NVML_SUCCESS) { - WARN("nvmlInit() failed: %s", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlShutdown(void) { - if (nvmlInternalShutdown == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret = nvmlInternalShutdown(); - if (ret != NVML_SUCCESS) { - WARN("nvmlShutdown() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { - if (nvmlInternalDeviceGetHandleByPciBusId == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { - if (nvmlInternalDeviceGetIndex == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetIndex() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { - if (nvmlInternalDeviceGetNvLinkState == NULL) { - /* Do not warn, this symbol is optional. 
*/ - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret); - if (ret != NVML_SUCCESS) { - if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { - if (nvmlInternalDeviceGetNvLinkRemotePciInfo == NULL) { - /* Do not warn, this symbol is optional. */ - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret); - if (ret != NVML_SUCCESS) { - if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult) { - if (nvmlInternalDeviceGetNvLinkCapability == NULL) { - /* Do not warn, this symbol is optional. 
*/ - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret); - if (ret != NVML_SUCCESS) { - if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { - if (nvmlInternalDeviceGetNvLinkCapability == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} +#if NCCL_NVML_DIRECT + #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name; +#else + #include + #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr; #endif + +namespace { + NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ()) + NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ()) + NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ()) + NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*)) + NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*)) + NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device)) + NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device)) + NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index)) + NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r)) + NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive)) + NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t 
*pci)) + NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult)) + NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor)) + NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus)) + + std::mutex lock; // NVML has had some thread safety bugs + bool initialized = false; + thread_local bool threadInitialized = false; + ncclResult_t initResult; +} + +ncclResult_t ncclNvmlEnsureInitialized() { + // Optimization to avoid repeatedly grabbing the lock when we only want to + // read from the global tables. + if (threadInitialized) return initResult; + threadInitialized = true; + + std::lock_guard locked(lock); + + if (initialized) return initResult; + initialized = true; + + #if !NCCL_NVML_DIRECT + if (pfn_nvmlInit == nullptr) { + void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW); + if (libhandle == nullptr) { + WARN("Failed to open libnvidia-ml.so.1"); + initResult = ncclSystemError; + return initResult; + } + + struct Symbol { void **ppfn; char const *name; }; + std::initializer_list symbols = { + {(void**)&pfn_nvmlInit, "nvmlInit"}, + {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"}, + {(void**)&pfn_nvmlShutdown, "nvmlShutdown"}, + {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"}, + {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"}, + {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"}, + {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"}, + {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"}, + {(void**)&pfn_nvmlErrorString, "nvmlErrorString"}, + {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"}, + {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"}, + 
{(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"}, + {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"}, + {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"} + }; + for(Symbol sym: symbols) { + *sym.ppfn = dlsym(libhandle, sym.name); + } + } + #endif + + #if NCCL_NVML_DIRECT + bool have_v2 = true; + #else + bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null + #endif + nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)(); + if (res1 != NVML_SUCCESS) { + WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + + unsigned int ndev; + res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" :"", pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + + ncclNvmlDeviceCount = int(ndev); + if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) { + WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices); + initResult = ncclInternalError; + return initResult; + } + + for(int a=0; a < ncclNvmlDeviceCount; a++) { + res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + + res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + 
} + } + + for(int a=0; a < ncclNvmlDeviceCount; a++) { + for(int b=0; b < ncclNvmlDeviceCount; b++) { // cache P2P read/write status for each ordered (a,b) device pair + nvmlDevice_t da = ncclNvmlDevices[a].handle; + nvmlDevice_t db = ncclNvmlDevices[b].handle; + + res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + + res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + } + } + + initResult = ncclSuccess; + return initResult; +} + +#define NVMLCHECK(name, ...) do { \ + nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ + if (e44241808 != NVML_SUCCESS) { \ + WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ + return ncclSystemError; \ + } \ +} while(0) + +#define NVMLTRY(name, ...) 
do { \ + if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \ + return ncclInternalError; /* missing symbol is not a warned error */ \ + nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ + if (e44241808 != NVML_SUCCESS) { \ + if (e44241808 != NVML_ERROR_NOT_SUPPORTED) \ + INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ + return ncclSystemError; \ + } \ +} while(0) + +ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device); + return ncclSuccess; +} + +ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + *device = ncclNvmlDevices[index].handle; + return ncclSuccess; +} + +ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + for (int d=0; d < ncclNvmlDeviceCount; d++) { + if (ncclNvmlDevices[d].handle == device) { + *index = d; + return ncclSuccess; + } + } + return ncclInvalidArgument; +} + +ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive); + return ncclSuccess; +} + +ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci); + return ncclSuccess; +} + +ncclResult_t ncclNvmlDeviceGetNvLinkCapability( + nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, + unsigned int *capResult + ) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, 
capResult); + return ncclSuccess; +} + +ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + + for(int d=0; d < ncclNvmlDeviceCount; d++) { + if(device == ncclNvmlDevices[d].handle) { + *major = ncclNvmlDevices[d].computeCapabilityMajor; + *minor = ncclNvmlDevices[d].computeCapabilityMinor; + return ncclSuccess; + } + } + return ncclInvalidArgument; +} + +ncclResult_t ncclNvmlDeviceGetP2PStatus( + nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, + nvmlGpuP2PStatus_t* p2pStatus + ) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + + if (p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) { + int a = -1, b = -1; + for(int d=0; d < ncclNvmlDeviceCount; d++) { + if(device1 == ncclNvmlDevices[d].handle) a = d; + if(device2 == ncclNvmlDevices[d].handle) b = d; + } + if (a == -1 || b == -1) return ncclInvalidArgument; + if (p2pIndex == NVML_P2P_CAPS_INDEX_READ) + *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusRead; + else + *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusWrite; + } + else { + std::lock_guard locked(lock); + NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus); + } + return ncclSuccess; +} diff --git a/src/misc/nvmlwrap_stub.cc b/src/misc/nvmlwrap_stub.cc index 41485df101..a9462f0d04 100644 --- a/src/misc/nvmlwrap_stub.cc +++ b/src/misc/nvmlwrap_stub.cc @@ -7,50 +7,50 @@ #include "nvmlwrap.h" -ncclResult_t wrapNvmlSymbols(void) { +ncclResult_t ncclNvmlSymbols(void) { return ncclSuccess; } -ncclResult_t wrapNvmlInit(void) { +ncclResult_t ncclNvmlInit(void) { return ncclSuccess; } -ncclResult_t wrapNvmlShutdown(void) { +ncclResult_t ncclNvmlShutdown(void) { return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { +ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { return ncclSystemError; } 
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { +ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { *index = 0; return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { +ncclResult_t ncclNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { return ncclSystemError; } -ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { +ncclResult_t ncclNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { *minorNumber = 0; return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { +ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { return ncclSystemError; } -ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { +ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { return ncclSystemError; } -ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, +ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult) { return ncclSystemError; } -ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { +ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { *major = *minor = 1; return ncclSuccess; } diff --git a/src/misc/param.cc b/src/misc/param.cc new file mode 100644 index 0000000000..a59713cf3b --- /dev/null +++ b/src/misc/param.cc @@ -0,0 +1,81 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "param.h" +#include "debug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char* userHomeDir() { + struct passwd *pwUser = getpwuid(getuid()); + return pwUser == NULL ? NULL : pwUser->pw_dir; +} + +void setEnvFile(const char* fileName) { + FILE * file = fopen(fileName, "r"); + if (file == NULL) return; + + char *line = NULL; + char envVar[1024]; + char envValue[1024]; + size_t n = 0; + ssize_t read; + while ((read = getline(&line, &n, file)) != -1) { + if (line[read-1] == '\n') line[read-1] = '\0'; + int s=0; // Env Var Size + while (line[s] != '\0' && line[s] != '=') s++; + if (line[s] == '\0') continue; + strncpy(envVar, line, std::min(1023,s)); + envVar[std::min(1023,s)] = '\0'; // terminate at the clamped length so a name longer than 1023 chars cannot write past envVar + s++; + strncpy(envValue, line+s, 1023); + envValue[1023]='\0'; + setenv(envVar, envValue, 0); + //printf("%s : %s->%s\n", fileName, envVar, envValue); + } + if (line) free(line); + fclose(file); +} + +void initEnv() { + char confFilePath[1024]; + const char * userDir = userHomeDir(); + if (userDir) { + snprintf(confFilePath, sizeof(confFilePath), "%s/.rccl.conf", userDir); // bounded: the home directory length is not controlled here + setEnvFile(confFilePath); + } + sprintf(confFilePath, "/etc/rccl.conf"); + setEnvFile(confFilePath); +} + +void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { + static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_lock(&mutex); + if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { + char* str = getenv(env); + int64_t value = deftVal; + if (str && strlen(str) > 0) { + errno = 0; + value = strtoll(str, nullptr, 0); + if (errno) { + value = deftVal; + INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); + } else { + INFO(NCCL_ALL,"%s set by environment to %lld.", env, (long long)value); + } + } + __atomic_store_n(cache, value, __ATOMIC_RELAXED); + } + 
pthread_mutex_unlock(&mutex); +} diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc new file mode 100644 index 0000000000..145b18fe8c --- /dev/null +++ b/src/misc/profiler.cc @@ -0,0 +1,115 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "profiler.h" + +//#define PROFILE_PROXY 1 +#ifdef PROFILE_PROXY +#include "timer.h" +#include "alloc.h" + +static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" }; +static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" }; +static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" }; +struct ncclProxyProfileEvent { + double timestamp[6]; + uint64_t opCount; + int peer; + int step; + uint16_t channel; + uint8_t type; // send / recv + uint8_t opIndex; +}; + +struct ncclProxyProfileEvent* profilingEvents = NULL; +int profilingIndex = 0; +double profilingStart = 0; +#define MAX_EVENTS 200000 + +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { + if (profilingEvents == NULL) { + NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS)); + profilingStart = gettime(); + } + struct ncclProxyProfileEvent* event = NULL; + if (state%8 == 0) { + if (profilingIndex == MAX_EVENTS) return ncclSuccess; + args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++; + if (state == ncclProxyProfileBegin) { + // Proxy operation information + event->opCount = args->opCount; + event->channel = args->subs[sub].channelId; + event->peer = args->subs[sub].peer; + event->type = args->pattern; + event->step = step; + event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256; + } else event->peer = -state; + } else { + event = (struct 
ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; + if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL; + if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount; + } + // Timestamp + event->timestamp[state%8] = gettime()-profilingStart; + return ncclSuccess; +} + +void ncclProfilingDump() { + static int dumpDone = 0; + if (dumpDone) return; + dumpDone = 1; + const char* str = getenv("NCCL_PROXY_PROFILE"); + if (!str) { free(profilingEvents); return; } + FILE* f = fopen(str, "w"); + fprintf(f, "[\n"); + + for (int i=0; ipeer >= 0; + const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") : + profilingEventStr[-(e->peer/8)]; + + + if (sendrecv) { + int state = ncclProxyProfileBegin; + const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr; + fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n", + typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex); + + while (statetimestamp[state]) { + const char* name = stateStr[state]; + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", + name, i, e->channel, e->timestamp[state]); + state++; + while (e->timestamp[state] == 0) state++; + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", + name, i, e->channel, e->timestamp[state]); + } + } + + fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", + typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]); + } else { + if (e->peer == -ncclProxyProfileAppend) { + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, 
\"args\": { \"added\": %ld } },\n", + typeStr, i, e->timestamp[0], e->opCount); + } else { + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", + typeStr, i, e->timestamp[0]); + } + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", + typeStr, i, e->timestamp[1]); + } + } + fprintf(f, "{} ]\n"); + fclose(f); + free(profilingEvents); +} +#else +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } +void ncclProfilingDump() {} +#endif diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc new file mode 100644 index 0000000000..a8a3c4e534 --- /dev/null +++ b/src/misc/shmutils.cc @@ -0,0 +1,90 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "shm.h" +#include "checks.h" +#include +#include +#include +#include +#include +#include +#include +#include + +// Change functions behavior to match other SYS functions +static int shm_allocate(int fd, const int shmSize) { + int err = posix_fallocate(fd, 0, shmSize); + if (err) { errno = err; return -1; } + return 0; +} +static int shm_map(int fd, const int shmSize, void** ptr) { + *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + return (*ptr == MAP_FAILED) ? 
-1 : 0; +} + +static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) { + if (create) { + if (shmPath[0] == '\0') { + sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); + *fd = mkstemp(shmPath); + } else { + SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); + } + if (ftruncate(*fd, shmSize) != 0) { + WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize); + return ncclSystemError; + } + } else { + SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); + } + *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); + if (*ptr == MAP_FAILED) { // mmap reports failure as MAP_FAILED, never NULL; *fd stays open so the caller's error path closes it + WARN("Could not map %s\n", shmPath); + return ncclSystemError; + } + close(*fd); + *fd = -1; + if (create) memset(*ptr, 0, shmSize); + return ncclSuccess; +} + +ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) { + int fd = -1; + void* ptr = MAP_FAILED; + ncclResult_t res = ncclSuccess; + + NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError); + if (devShmPtr) { + CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, cudaError); + CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError); + } + + *shmPtr = ptr; + return ncclSuccess; +sysError: + WARN("Error while %s shared memory segment %s (size %d)", create ? 
"creating" : "attaching to", shmPath, shmSize); +cudaError: + if (fd != -1) close(fd); + if (create) unlink(shmPath); // shmPath is a filesystem path (as in ncclShmUnlink), not a POSIX shm object name + if (ptr != MAP_FAILED) munmap(ptr, shmSize); + *shmPtr = NULL; + return res; +} + +ncclResult_t ncclShmUnlink(const char* shmPath) { + if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink"); + return ncclSuccess; +} + +ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) { + if (devShmPtr) CUDACHECK(hipHostUnregister(shmPtr)); + if (munmap(shmPtr, shmSize) != 0) { + WARN("munmap of shared memory failed"); + return ncclSystemError; + } + return ncclSuccess; +} diff --git a/src/misc/socket.cc b/src/misc/socket.cc new file mode 100644 index 0000000000..ef2bea65a5 --- /dev/null +++ b/src/misc/socket.cc @@ -0,0 +1,556 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "socket.h" +#include "utils.h" +#include + +#include +#include +#include + +/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo() + * + * Output: "IPv4/IPv6 address" + */ +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { + if (buf == NULL || addr == NULL) return NULL; + struct sockaddr *saddr = &addr->sa; + if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } + char host[NI_MAXHOST], service[NI_MAXSERV]; + /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. + * (When not set, this will still happen in case the node's name cannot be determined.) + */ + int flag = NI_NUMERICSERV | (numericHostForm ? 
NI_NUMERICHOST : 0); + (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag); + sprintf(buf, "%s<%s>", host, service); + return buf; +} + +static uint16_t socketToPort(union ncclSocketAddress *addr) { + struct sockaddr *saddr = &addr->sa; + return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port); +} + +/* Allow the user to force the IPv4/IPv6 interface selection */ +static int envSocketFamily(void) { + int family = -1; // Family selection is not forced, will use first one found + char* env = getenv("NCCL_SOCKET_FAMILY"); + if (env == NULL) + return family; + + INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env); + + if (strcmp(env, "AF_INET") == 0) + family = AF_INET; // IPv4 + else if (strcmp(env, "AF_INET6") == 0) + family = AF_INET6; // IPv6 + return family; +} + +static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { +#ifdef ENABLE_TRACE + char line[SOCKET_NAME_MAXLEN+1]; +#endif + struct netIf userIfs[MAX_IFS]; + bool searchNot = prefixList && prefixList[0] == '^'; + if (searchNot) prefixList++; + bool searchExact = prefixList && prefixList[0] == '='; + if (searchExact) prefixList++; + int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); + + int found = 0; + struct ifaddrs *interfaces, *interface; + getifaddrs(&interfaces); + for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { + if (interface->ifa_addr == NULL) continue; + + /* We only support IPv4 & IPv6 */ + int family = interface->ifa_addr->sa_family; + if (family != AF_INET && family != AF_INET6) + continue; + + TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line)); + + /* Allow the caller to force the socket family type */ + if (sock_family != -1 && family != sock_family) + continue; 
+ + /* We also need to skip IPv6 loopback interfaces */ + if (family == AF_INET6) { + struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr); + if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue; + } + + // check against user specified interfaces + if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) { + continue; + } + + // Check that this interface has not already been saved + // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link + bool duplicate = false; + for (int i = 0; i < found; i++) { + if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } + } + + if (!duplicate) { + // Store the interface name + strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); + // Store the IP address + int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memcpy(addrs+found, interface->ifa_addr, salen); + found++; + } + } + + freeifaddrs(interfaces); + return found; +} + +static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) { + /* Check family first */ + int family = local_if.ifa_addr->sa_family; + if (family != remote->sa.sa_family) { + return false; + } + + if (family == AF_INET) { + struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); + struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask); + struct sockaddr_in& remote_addr = remote->sin; + struct in_addr local_subnet, remote_subnet; + local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; + remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; + return (local_subnet.s_addr ^ remote_subnet.s_addr) ? 
false : true; + } else if (family == AF_INET6) { + struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); + struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); + struct sockaddr_in6& remote_addr = remote->sin6; + struct in6_addr& local_in6 = local_addr->sin6_addr; + struct in6_addr& mask_in6 = mask->sin6_addr; + struct in6_addr& remote_in6 = remote_addr.sin6_addr; + bool same = true; + int len = 16; //IPv6 address is 16 unsigned char + for (int c = 0; c < len; c++) { //Network byte order is big-endian + char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c]; + char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c]; + if (c1 ^ c2) { + same = false; + break; + } + } + // At last, we need to compare scope id + // Two Link-type addresses can have the same subnet address even though they are not in the same scope + // For Global type, this field is 0, so a comparison wouldn't matter + same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); + return same; + } else { + WARN("Net : Unsupported address family type"); + return false; + } +} + +int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { +#ifdef ENABLE_TRACE + char line[SOCKET_NAME_MAXLEN+1]; +#endif + char line_a[SOCKET_NAME_MAXLEN+1]; + int found = 0; + struct ifaddrs *interfaces, *interface; + getifaddrs(&interfaces); + for (interface = interfaces; interface && !found; interface = interface->ifa_next) { + if (interface->ifa_addr == NULL) continue; + + /* We only support IPv4 & IPv6 */ + int family = interface->ifa_addr->sa_family; + if (family != AF_INET && family != AF_INET6) + continue; + + // check against user specified interfaces + if (!matchSubnet(*interface, remoteAddr)) { + continue; + } + + // Store the local IP address + int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memcpy(localAddrs+found, interface->ifa_addr, salen); + + // Store the interface name + strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); + + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a)); + found++; + if (found == maxIfs) break; + } + + if (found == 0) { + WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a)); + } + freeifaddrs(interfaces); + return found; +} + +ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) { + if (!(ip_port_pair && strlen(ip_port_pair) > 1)) { + WARN("Net : string is null"); + return ncclInvalidArgument; + } + + bool ipv6 = ip_port_pair[0] == '['; + /* Construct the sockaddress structure */ + if (!ipv6) { + struct netIf ni; + // parse : string, expect one pair + if (parseStringList(ip_port_pair, &ni, 1) != 1) { + WARN("Net : No valid : pair found"); + return ncclInvalidArgument; + } + + struct addrinfo hints, *p; + int rv; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) { + WARN("Net : error encountered when getting address info : %s", gai_strerror(rv)); + return ncclInvalidArgument; + } + + // use the first + if (p->ai_family == AF_INET) { + struct sockaddr_in& sin = ua->sin; + memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; // IPv4 + //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address + sin.sin_port = htons(ni.port); // port + } else if (p->ai_family == AF_INET6) { + struct sockaddr_in6& sin6 = ua->sin6; + memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6)); + sin6.sin6_family = AF_INET6; // IPv6 + sin6.sin6_port = htons(ni.port); // port + 
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete + sin6.sin6_scope_id = 0; // should be global scope, set to 0 + } else { + WARN("Net : unsupported IP family"); + return ncclInvalidArgument; + } + + freeaddrinfo(p); // all done with this structure + + } else { + int i, j = -1, len = strlen(ip_port_pair); + for (i = 1; i < len; i++) { + if (ip_port_pair[i] == '%') j = i; + if (ip_port_pair[i] == ']') break; + } + if (i == len) { + WARN("Net : No valid [IPv6]:port pair found"); + return ncclInvalidArgument; + } + bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope + + char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ]; + memset(ip_str, '\0', sizeof(ip_str)); + memset(port_str, '\0', sizeof(port_str)); + memset(if_name, '\0', sizeof(if_name)); + strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1); + strncpy(port_str, ip_port_pair+i+2, len-i-1); + int port = atoi(port_str); + if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name + + struct sockaddr_in6& sin6 = ua->sin6; + sin6.sin6_family = AF_INET6; // IPv6 + inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address + sin6.sin6_port = htons(port); // port + sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete + sin6.sin6_scope_id = global_scope ? 
0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope + } + return ncclSuccess; +} + +int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { + static int shownIfName = 0; + int nIfs = 0; + // Allow user to force the INET socket family selection + int sock_family = envSocketFamily(); + // User specified interface + char* env = getenv("NCCL_SOCKET_IFNAME"); + if (env && strlen(env) > 1) { + INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); + // Specified by user : find or fail + if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); + nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + } else { + // Try to automatically pick the right one + // Start with IB + nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + // else see if we can get some hint from COMM ID + if (nIfs == 0) { + char* commId = getenv("NCCL_COMM_ID"); + if (commId && strlen(commId) > 1) { + INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); + // Try to find interface that is in the same subnet as the IP in comm id + union ncclSocketAddress idAddr; + ncclGetSocketAddrFromString(&idAddr, commId); + nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); + } + } + // Then look for anything else (but not docker or lo) + if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + // Finally look for docker, then lo. + if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + } + return nIfs; +} + +ncclResult_t ncclSocketListen(struct ncclSocket* sock) { + /* IPv4/IPv6 support */ + int family = sock->addr.sa.sa_family; + int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + int flags; + + /* Create socket and bind it to a port */ + int fd = socket(family, SOCK_STREAM, 0); + if (fd == -1) { + WARN("Net : Socket creation failed : %s", strerror(errno)); + return ncclSystemError; + } + + if (socketToPort(&sock->addr)) { + // Port is forced by env. Make sure we get the port. + int opt = 1; +#if defined(SO_REUSEPORT) + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); +#else + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); +#endif + } + + /* make all new sockets non-blocking */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + + // addr port should be 0 (Any port) + SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind"); + + /* Get the assigned Port */ + socklen_t size = salen; + SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname"); + +#ifdef ENABLE_TRACE + char line[SOCKET_NAME_MAXLEN+1]; + TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line)); +#endif + + /* Put the socket in listen mode + * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn + */ + SYSCHECK(listen(fd, 16384), "listen"); + sock->fd = fd; + return ncclSuccess; +} + +static ncclResult_t getFdState(int fd, enum ncclSocketState* state) { + struct pollfd pfd; + int timeout = 1, ret; + socklen_t rlen = sizeof(int); + + memset(&pfd, 0, sizeof(struct pollfd)); + pfd.fd = fd; + pfd.events = POLLOUT; + SYSCHECK(ret = poll(&pfd, 1, timeout), "poll"); + if (ret == 0) { + ret = EINPROGRESS; + } else { + /* check socket status */ + EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0); + SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); + } + + if (ret == EINPROGRESS) + *state = ncclSocketConnecting; + else if (ret == 0) + *state = ncclSocketConnected; + else + *state = 
ncclSocketError; + return ncclSuccess; +} + +ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) { + NCCLCHECK(getFdState(sock->fd, state)); + sock->state = *state; + return ncclSuccess; +} + +ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { + char line[SOCKET_NAME_MAXLEN+1]; + /* IPv4/IPv6 support */ + int family = sock->addr.sa.sa_family; + if (family != AF_INET && family != AF_INET6) { + WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", + ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6); + return ncclInternalError; + } + int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + int flags; + + /* Connect to a hostname / port */ + int fd = socket(family, SOCK_STREAM, 0); + if (fd == -1) { + WARN("Net : Socket creation failed : %s", strerror(errno)); + return ncclSystemError; + } + + const int one = 1; + SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + + /* support non-blocking socket; by default, the socket is non-blocking */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + + /* const int bufsize = 128*1024; + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/ + + TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line)); + + int ret; + int timedout_retries = 0; + int refused_retries = 0; +retry: + /* async connect; abort when error happens and abortFlag is present. 
*/ + ret = connect(fd, &sock->addr.sa, salen); + + if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } else if (errno == EINPROGRESS && !sock->asyncFlag) { + enum ncclSocketState state; + do { + if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0); + NCCLCHECK(getFdState(fd, &state)); + } while (state == ncclSocketConnecting); + EQCHECK(state, ncclSocketError); + ret = 0; + } + + if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) { + sock->fd = fd; + return ncclSuccess; + } + + WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + return ncclSystemError; +} + +ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) { + socklen_t socklen = sizeof(union ncclSocketAddress); + int tmpFd = sock->fd = -1; + + do { + if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0); + tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen); + } while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag); + + if (!listenSocket->asyncFlag) { + EQCHECK(tmpFd, -1); + } else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { + return ncclSystemError; + } + + sock->fd = tmpFd; + return ncclSuccess; +} + +ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) { + if (sock == NULL) + return ncclSuccess; + + sock->fd = -1; + if (addr) { + memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress)); + } else { + memset(&sock->addr, 0, sizeof(union ncclSocketAddress)); + } + sock->abortFlag = abortFlag; + sock->asyncFlag = asyncFlag; + sock->state = ncclSocketStateNum; + return ncclSuccess; +} + +static ncclResult_t 
ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { + int bytes = 0; + *closed = 0; + char* data = (char*)ptr; + char line[SOCKET_NAME_MAXLEN+1]; + do { + if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_RECV && bytes == 0) { + *closed = 1; + return ncclSuccess; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + return ncclSystemError; + } else { + bytes = 0; + } + } + (*offset) += bytes; + if (sock->abortFlag && *sock->abortFlag != 0) { + INFO(NCCL_NET, "Socket progress: abort called"); + return ncclSystemError; + } + } while (bytes > 0 && (*offset) < size); + return ncclSuccess; +} + +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { + int closed; + NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed)); + if (closed) { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { + while (*offset < size) + NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset)); + return ncclSuccess; +} + +ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) { + int offset = 0; + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset)); + return ncclSuccess; +} + +ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) { + int offset = 0; + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset)); + return ncclSuccess; +} + 
+// Receive or detect connection closed +ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) { + int offset = 0; + *closed = 0; + while (offset < size) { + NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); + if (*closed) return ncclSuccess; + } + return ncclSuccess; +} diff --git a/src/net.cc b/src/net.cc new file mode 100644 index 0000000000..934a6faef4 --- /dev/null +++ b/src/net.cc @@ -0,0 +1,265 @@ +#include "net.h" +#include "bootstrap.h" +#include "checks.h" + +#include +#include +#include +//#include +//#include +//#include + +ncclNet_t *ncclNet; +ncclCollNet_t *ncclCollNet; + +static ncclNet_v5_t ncclNet_v4_as_v5; +static ncclNet_v4_t *ncclNet_v4; +static ncclCollNet_v5_t ncclCollNet_v4_as_v5; +static ncclCollNet_v4_t *ncclCollNet_v4; + +static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { + ncclNetProperties_v4_t p4; + ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4); + if (ans != ncclSuccess) return ans; + props->name = p4.name; + props->pciPath = p4.pciPath; + props->guid = p4.guid; + props->ptrSupport = p4.ptrSupport; + props->speed = p4.speed; + props->port = p4.port; + props->maxComms = p4.maxComms; + props->maxRecvs = 1; + props->latency = 0; + return ncclSuccess; +} + +static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { + return ncclNet_v4->isend(sendComm, data, size, mhandle, request); +} + +static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { + if (n == 0) return ncclSuccess; + if (n != 1) return ncclInvalidArgument; + return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request); +} + +static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { + if (n == 0) return ncclSuccess; + if (n != 1) 
return ncclInvalidArgument; + return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request); +} + +// We use a wrapper around the v4 init to copy over the struct contents +// post-init since they may not be initialized before hand. +static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclNet_v4->init(logfn)); + ncclNet_v4_as_v5.name = ncclNet_v4->name; + ncclNet_v4_as_v5.devices = ncclNet_v4->devices; + ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties; + ncclNet_v4_as_v5.listen = ncclNet_v4->listen; + ncclNet_v4_as_v5.connect = ncclNet_v4->connect; + ncclNet_v4_as_v5.accept = ncclNet_v4->accept; + ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr; + ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr; + ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend; + ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv; + ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush; + ncclNet_v4_as_v5.test = ncclNet_v4->test; + ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend; + ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv; + ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { + ncclNetProperties_v4_t p4; + ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4); + if (ans != ncclSuccess) return ans; + props->name = p4.name; + props->pciPath = p4.pciPath; + props->guid = p4.guid; + props->ptrSupport = p4.ptrSupport; + props->speed = p4.speed; + props->port = p4.port; + props->maxComms = p4.maxComms; + props->maxRecvs = 1; + props->latency = 0; + return ncclSuccess; +} + +// We use a wrapper around the v4 init to copy over the struct contents +// post-init since they may not be initialized before hand. 
+static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v4->init(logfn)); + ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name; + ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices; + ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties; + ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen; + ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect; + ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport; + ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr; + ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr; + ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce; + ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush; + ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test; + ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl; + ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen; + return ncclSuccess; +} + +static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) { + char ncclNetPluginName[128]; + const char* envPluginName = getenv("NCCL_NET_PLUGIN"); + if (envPluginName && strlen(envPluginName)) { + snprintf(ncclNetPluginName, 128, "librccl-net-%s.so", envPluginName); + INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName); + } else { + sprintf(ncclNetPluginName, "librccl-net.so"); + } + void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); + if (netPluginLib == nullptr) { + // dlopen does not guarantee to set errno, but dlerror only gives us a + // string, so checking errno doesn't hurt to try to provide a better + // error message + if (errno == ENOENT) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName); + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); + } + return; + } + + *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); + if (*net == nullptr) { + INFO(NCCL_INIT|NCCL_NET, 
"NET/Plugin: Failed to find ncclNetPlugin_v5 symbol."); + ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); + if (ncclNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol."); + if (netPluginLib != nullptr) dlclose(netPluginLib); + return; + } + *net = &ncclNet_v4_as_v5; + ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init; + } + + // Check for CollNet + *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); + if (*collnet == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol."); + ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); + if (ncclCollNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol."); + } else { + *collnet = &ncclCollNet_v4_as_v5; + ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init; + } + } + return; +} + +ncclResult_t ncclNetInit() { + // Always initialize bootstrap network + NCCLCHECK(bootstrapNetInit()); + + // Initialize main communication network + ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; + ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr }; + initPlugin(&nets[0], &collNets[0]); + char* netName = getenv("NCCL_NET"); + bool ok = false; + + for (int i=0; i<3; i++) { + if (nets[i] == nullptr) continue; + if (netName && strcmp(netName, nets[i]->name) != 0) continue; + + // net plugin is already initialized + int ndev; + if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue; + if (nets[i]->devices(&ndev) != ncclSuccess) continue; + if (ndev <= 0) continue; + ncclNet = nets[i]; + ok = true; + + if (collNets[i]) { + do { + if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break; + if (collNets[i]->devices(&ndev) != ncclSuccess) break; + if (ndev <= 0) break; + ncclCollNet = collNets[i]; + } while(0); + } + break; + } + + if (!ok) { + WARN("Error: network %s not found.", netName ? 
netName : ""); + return ncclInvalidUsage; + } + return ncclSuccess; +} + +ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { + constexpr int GPU_BUF_SIZE = 2*1024*1024; +#if CUDART_VERSION >= 11030 + // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute + int driverVersion; + CUDACHECK(cudaDriverGetVersion(&driverVersion)); + if (driverVersion >= 11030) { + int cudaDev, attr = 0; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); + *gdrSupport = attr; + return ncclSuccess; + } +#endif + int netDevs; + NCCLCHECK(ncclNetDevices(&netDevs)); + *gdrSupport = 0; + for (int dev=0; devproxyState; +static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) { struct ncclProxyArgs* elem; if (state->pool == NULL) { - // Check whether there are freed elements - if (state->poolReturned) { - pthread_mutex_lock(&state->poolMutex); - state->pool = state->poolReturned; - state->poolReturned = NULL; - pthread_mutex_unlock(&state->poolMutex); - } else { - // Allocate a new pool of elements. Make sure we allocate the memory close - // to the network thread - struct ncclProxyPool* newPool; - cpu_set_t affinitySave; - if (CPU_COUNT(&comm->cpuAffinity)) { - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - } - NCCLCHECK(ncclCalloc(&newPool, 1)); - if (CPU_COUNT(&comm->cpuAffinity)) { - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - } + // Allocate a new pool of elements. 
Make sure we allocate the memory close + // to the network thread + struct ncclProxyPool* newPool; + NCCLCHECK(ncclCalloc(&newPool, 1)); - struct ncclProxyArgs* newElems = newPool->elems; - // Chain newly allocated elements - for (int i=0; ipool = newElems; - // Save the pool memory block for later resource release - newPool->next = state->pools; - state->pools = newPool; + struct ncclProxyArgs* newElems = newPool->elems; + // Chain newly allocated elements + for (int i=0; ipool = newElems; + // Save the pool memory block for later resource release + newPool->next = state->pools; + state->pools = newPool; } elem = state->pool; state->pool = state->pool->next; @@ -83,241 +71,394 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a #define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1) #define OP_SEEN 0x100000 -ncclResult_t dumpProxyState(struct ncclProxyState* state) { -#ifdef DEBUG_PROXY - struct ncclProxyArgs* op = state->ops; - while (op) { - if (op->idle & OP_SEEN) { - WARN("Active list loop at element %ld", OP_INDEX(op)); + +ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex) { + struct ncclProxyPool* pool = state->pools; + int p = 0; + while (pool) { + uint64_t o = op-pool->elems; + if (o < PROXYARGS_ALLOCATE_SIZE) { + *opIndex = o; + *poolIndex = p; + return ncclSuccess; } - op->idle |= OP_SEEN; - printf("[%ld(%ld/%d)]", OP_INDEX(op), op->opCount, op->nsubs); - if (op->nextPeer) { - printf("(%ld)", OP_INDEX(op->nextPeer)); - struct ncclProxyArgs* n = op->nextPeer; - n->idle |= OP_SEEN; - while (n->nextPeer) { - n = n->nextPeer; - n->idle |= OP_SEEN; + pool = pool->next; + p++; + } + WARN("Could not find pool of op %p\n", op); + return ncclInternalError; +} + +ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) { + printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? 
"Send" : op->pattern == ncclPatternRecv ? "Recv" : "Coll"); + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = op->subs+s; + if (op->state == ncclProxyOpProgress) { + char status = ' '; + if (op->pattern == ncclPatternRecv) { + if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init + else if (sub->received < sub->posted) status = 'R'; // Receiving + else if (sub->received < sub->transmitted) status = 'R'; // Receiving + else if (sub->transmitted < sub->received) status = 'F'; // Flushing + else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU + else status = 'D'; // Done + } else if (op->pattern == ncclPatternSend) { + if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init + else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU + else if (sub->done < sub->transmitted) status = 'S'; // Sending + else status = 'D'; // Done } + printf(" %d%c/%d", sub->peer, status, sub->channelId); + } else { + printf(" %d/%d", sub->peer, sub->channelId); } + } + printf("]"); + return ncclSuccess; +} +ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) { + struct ncclProxyArgs* op = state->active; + int poolIndex, opIndex; + printf("ACTIVE OPS\n"); + while (op) { + NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); + if (op->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); + } + NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); + op->state |= OP_SEEN; + printf("\n"); + struct ncclProxyArgs* nextOp = op->nextPeer; + while (nextOp) { + NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex)); + if (nextOp->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); + } + printf("| `-> "); + NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex)); + nextOp->state |= OP_SEEN; + printf("\n"); + if (nextOp->next) { + WARN("Inactive op has next set!\n"); + } + nextOp = nextOp->nextPeer; + } + if (op->nextPeer 
== NULL) printf("|\n"); + op = op->next; + printf("v\n"); + } + printf("[X]\n"); + +# if 0 + printf("FREE OPS\n"); + op = state->pool; + while (op) { + NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); + if (op->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); + } + NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); + op->state |= OP_SEEN; printf("->"); op = op->next; } printf("[X]\n"); - - struct ncclProxyArgs* free = state->pool; - while (free) { - if (free->idle & OP_SEEN) { - WARN("Free list loop at element %ld", OP_INDEX(free)); +#else + op = state->pool; + while (op) { + NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); + if (op->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); } - free->idle |= OP_SEEN; - free = free->next; - } - - struct ncclProxyPool* p = state->pools; - int i = 0; - while (p) { - for (int e=0; eelems[e].idle & OP_SEEN) == 0) { - WARN("Element %d of pool %d has been lost", e, i); - struct ncclProxyArgs* free = state->pool; - printf("Free list "); - while (free) { - printf("--> %ld ", OP_INDEX(free)); - free = free->next; - } - printf("\n"); - return ncclInternalError; - } - p->elems[e].idle -= OP_SEEN; - } - p = p->next; - i++; + op->state |= OP_SEEN; + op = op->next; } #endif + + struct ncclProxyPool* pool = state->pools; + poolIndex = 0; + while (pool) { + struct ncclProxyArgs* elem = pool->elems; + for (int e=0; estate & OP_SEEN) == 0) { + printf("Elem %d-%d is not in any list:\n", poolIndex, e); + NCCLCHECK(printProxyOp(elem, poolIndex, e)); + printf("\n"); + } else { + elem->state -= OP_SEEN; + } + } + pool = pool->next; + poolIndex++; + } return ncclSuccess; } -static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args) { - struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr; - int shared = args->subs[0].connector->conn.shared; - if (proxyAppend) { - if (shared && proxyAppend->opCount == args->opCount) { - if 
((proxyAppend->sliceSteps != args->sliceSteps) || - (proxyAppend->chunkSteps != args->chunkSteps) || - (proxyAppend->protocol != args->protocol) || - (proxyAppend->dtype != args->dtype) || - (proxyAppend->redOp != args->redOp)) { - WARN("Proxy append mismatch"); - return ncclInternalError; - } - if (proxyAppend->nsubs >= NCCL_PROXY_MAX_SUBS) { - WARN("Proxy append out of bound"); - return ncclInternalError; - } - memcpy(proxyAppend->subs+proxyAppend->nsubs, args->subs, sizeof(struct ncclProxySubArgs)); - proxyAppend->nsubs++; - args->next = proxyAppend->next; - // Free args as we merged them - args->next = state->poolFreed; - state->poolFreed = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group with %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend)); +static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyArgs* args, int subIndex) { + struct ncclProxySubArgs* sub = args->subs+subIndex; + if (subIndex >= NCCL_PROXY_MAX_SUBS) { + WARN("Proxy append out of bounds"); + return ncclInternalError; + } + + //memset(sub, 0, sizeof(struct ncclProxySubArgs)); + sub->connection = op->connection; + sub->channelId = op->channelId; + sub->nsteps = op->nsteps; + sub->nbytes = op->nbytes; + sub->peer = op->root; + args->nsubs = subIndex+1; + if (subIndex) { + if ((args->sliceSteps != op->sliceSteps) || + (args->chunkSteps != op->chunkSteps) || + (args->protocol != op->protocol) || + (args->dtype != op->dtype) || + (args->redOp != op->redOp)) { + WARN("Proxy append mismatch"); + return ncclInternalError; + } + if (args->state != ncclProxyOpReady) { + WARN("Proxy append on running operation"); + return ncclInternalError; + } + return ncclSuccess; + } + //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress)); + args->done = 0; + args->opCount = op->opCount; + args->sliceSteps = op->sliceSteps; + args->chunkSteps = op->chunkSteps; + args->chunkSize = op->chunkSize; + 
args->dtype = op->dtype; + args->redOp = op->redOp; + args->pattern = op->pattern; + args->protocol = op->protocol; + args->state = ncclProxyOpReady; + args->progress = op->connection->tcomm->proxyProgress; + args->proxyAppendPtr = op->connection->proxyAppendPtr; + return ncclSuccess; +} + +static ncclResult_t ProxyAppend(struct ncclProxyProgressState* state, struct ncclProxyOp* op) { + struct ncclProxyConnection* connection = op->connection; + int shared = connection->shared; + struct ncclProxyArgs* args = *connection->proxyAppendPtr; + + if (args) { + if (shared && args->opCount == op->opCount) { + NCCLCHECK(ncclProxyOpToArgs(op, args, args->nsubs)); + DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args)); } else { - proxyAppend->nextPeer = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend)); + struct ncclProxyArgs* prevArgs = args; + NCCLCHECK(allocateArgs(state, &args)); + NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); + prevArgs->nextPeer = args; + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs)); *(args->proxyAppendPtr) = args; } } else { // Nothing running for that peer. 
Add to the list - if (state->ops == NULL) { + NCCLCHECK(allocateArgs(state, &args)); + NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); + if (state->active == NULL) { // Create the list DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount); - state->ops = args; + state->active = args; } else { // Append element at the end of the list - struct ncclProxyArgs* last = state->ops; + struct ncclProxyArgs* last = state->active; while (last->next) last = last->next; last->next = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args),shared, args->opCount); + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount); } *(args->proxyAppendPtr) = args; } return ncclSuccess; } -static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args, int connIndex) { - if (peer < 0) return ncclSuccess; - - struct ncclChannel* channel = args->subs[0].channel; - struct ncclPeer* peerComm = channel->peers+peer; - struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex; - if (connector->transportComm == NULL) { - WARN("Rank %d has no transport for %s peer %d on channel %d", connector->comm->rank, - type == proxyRecv ? 
"recv" : "send", peer, channel->id); - return ncclInternalError; +ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextOpsEnd) { + pthread_mutex_lock(&pool->mutex); + if (pool->nextOps == -1) { + pool->nextOps = nextOps; + pthread_cond_signal(&pool->cond); + } else { + pool->ops[pool->nextOpsEnd].next = nextOps; } - if (connector->transportComm->proxy == NULL) return ncclSuccess; - - struct ncclProxyState* state = &connector->comm->proxyState; - struct ncclProxyArgs* op; - NCCLCHECK(allocateArgs(connector->comm, &op)); - memcpy(op, args, sizeof(struct ncclProxyArgs)); - op->subs[0].connector = connector; - op->progress = connector->transportComm->proxy; - op->state = ncclProxyOpReady; - op->proxyAppendPtr = connector->proxyAppendPtr; - - if (state->nextOps == NULL) state->nextOps = op; - else state->nextOpsEnd->next = op; - state->nextOpsEnd = op; + pool->nextOpsEnd = nextOpsEnd; + pthread_mutex_unlock(&pool->mutex); return ncclSuccess; } -ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks) { - struct ncclChannel* channel = args->subs[0].channel; - int pattern = args->pattern; +ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) { + struct ncclProxyOps* proxyOps = proxyConn->comm->proxyState.proxyOps; + if (proxyOps == NULL) return ncclInternalError; + proxyOps += proxyConn->localRank; + struct ncclProxyOpsPool* pool = proxyOps->pool; + + TIME_START(0); + int opIndex = proxyOps->freeOp; + struct ncclProxyOp* op; + if (opIndex != -1) { + op = pool->ops+opIndex; + proxyOps->freeOp = op->next; + } else { + int freeOp; + while ((freeOp = pool->freeOps[comm->localRank]) == -1) sched_yield(); + int freeOpNew; + while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+comm->localRank, freeOp, -1)) != freeOp) freeOp = freeOpNew; + opIndex = freeOp; + op = pool->ops+opIndex; + proxyOps->freeOp = op->next; + } + if (op->next != -1) 
__builtin_prefetch(pool->ops+op->next); // Prefetch next free op + memcpy(op, proxyOp, sizeof(struct ncclProxyOp)); + op->next = -1; + op->connection = proxyConn->connection; + if (proxyOps->nextOps == -1) { + proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex; + } else { + pool->ops[proxyOps->nextOpsEnd].next = opIndex; + proxyOps->nextOpsEnd = opIndex; + } + if (++proxyOps->count == MAX_OPS_PER_PEER) { + // Post what we have so far to free some ops in the pool + // Do not post last operations as we could have more coming with the same opCount, and posting + // them in different batches would break proxyArgs aggregation with subs. + uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount; + int lastOp = -1; + int toSend = 0; + int ops = 0; + for (int op= proxyOps->nextOps; op != proxyOps->nextOpsEnd; op=pool->ops[op].next) { + ops++; + if (pool->ops[op].opCount != lastOpCount) { + lastOp = op; + toSend = ops; + } + } + if (lastOp == -1) { + WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount); + return ncclInternalError; + } + // Cut chain at lastOp + int nextOps = proxyOps->nextOps; + proxyOps->nextOps = pool->ops[lastOp].next; + pool->ops[lastOp].next = -1; + NCCLCHECK(ncclProxyPost(proxyOps->pool, nextOps, lastOp)); + proxyOps->count -= toSend; + } + TIME_STOP(0); + return ncclSuccess; +} + +static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { + if (peer < 0) return ncclSuccess; + + struct ncclPeer* peerComm = channel->peers+peer; + struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex; + if (connector->transportComm == NULL) { + WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank, + type == proxyRecv ? 
"recv" : "send", peer, channel->id, connIndex); + return ncclInternalError; + } + if (connector->transportComm->proxyProgress == NULL) return ncclSuccess; + + NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op)); + return ncclSuccess; +} + +ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) { + struct ncclChannel* channel = comm->channels+op->channelId; + int pattern = op->pattern; if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { struct ncclRing* ring = &channel->ring; - if (NeedProxy(proxyRecv, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args, args->connIndex)); - if (NeedProxy(proxySend, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args, args->connIndex)); + if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex)); + if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex)); } if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { // Tree up struct ncclTree* tree = &channel->tree; - for (int i=0; idown[i], args, 0)); - NCCLCHECK(SaveProxy(proxySend, tree->up, args, 0)); + for (int i=0; idown[i], op, 0)); + NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0)); } if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { // Tree down struct ncclTree* tree = &channel->tree; - for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args, 0)); - NCCLCHECK(SaveProxy(proxyRecv, tree->up, args, 0)); + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0)); + NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0)); } if (pattern == ncclPatternCollTreeUpDown) { // CollTree up - 
NCCLCHECK(SaveProxy(proxySend, channel->collTree.out, args, 1)); // For CollTree up, we are using push + NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push // CollTree down - NCCLCHECK(SaveProxy(proxyRecv, channel->collTree.out, args, 0)); + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0)); } return ncclSuccess; } -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args) { - memset(args, 0, sizeof(struct ncclProxyArgs)); - int channelId = info->channelId; - args->nsubs = 1; - struct ncclProxySubArgs* sub = args->subs; +NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) { + memset(op, 0, sizeof(struct ncclProxyOp)); + int channelId = info->channelId; struct ncclChannel* channel = info->comm->channels+channelId; - sub->channel = channel; - args->sliceSteps = 1; - args->chunkSteps = 1; - args->protocol = NCCL_PROTO_SIMPLE; - args->dtype = info->datatype; - sub->delta = info->delta; - sub->recvbytes = info->recvbytes; - sub->sendbytes = info->sendbytes; + op->channelId = channelId; + op->sliceSteps = 1; + op->chunkSteps = 1; + op->protocol = NCCL_PROTO_SIMPLE; + op->dtype = info->datatype; + op->connIndex = info->connIndex; int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR; - info->recvChunkSize = stepSize; - info->sendChunkSize = stepSize; + info->chunkSize = stepSize; + op->root = info->root; + op->nbytes = info->count; + struct ncclPeer* peer = channel->peers + op->root; - if (info->delta > 0 && info->recvbytes >= 0) { - int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks; - if (channel->peers[peerrecv].recv[0].transportComm && channel->peers[peerrecv].recv[0].transportComm->proxy) { + if (info->coll == ncclFuncSend) { + op->pattern = ncclPatternSend; + if (op->root != info->comm->rank && 
peer->send[info->connIndex].transportComm && peer->send[info->connIndex].transportComm->proxyProgress) { // Tune chunk size for the network - if (info->recvbytes < stepSize) info->recvChunkSize /= 4; - else if (info->recvbytes < 8*stepSize) info->recvChunkSize /= 2; + if (info->count < stepSize) info->chunkSize /= 4; + else if (info->count < 8*stepSize) info->chunkSize /= 2; } - sub->recvChunkSize = info->recvChunkSize; + } else if (info->coll == ncclFuncRecv) { + op->pattern = ncclPatternRecv; + if (op->root != info->comm->rank && peer->recv[info->connIndex].transportComm && peer->recv[info->connIndex].transportComm->proxyProgress) { + // Tune chunk size for the network + if (info->count < stepSize) info->chunkSize /= 4; + else if (info->count < 8*stepSize) info->chunkSize /= 2; + } + } else { + WARN("P2p operation is neither send or recv"); + return ncclInternalError; } - if (info->delta > 0 && info->sendbytes >= 0) { - int peersend = (info->comm->rank+info->delta)%info->comm->nRanks; - if (channel->peers[peersend].send[0].transportComm && channel->peers[peersend].send[0].transportComm->proxy) { - // Tune chunk size for the network - if (info->sendbytes < stepSize) info->sendChunkSize /= 4; - else if (info->sendbytes < 8*stepSize) info->sendChunkSize /= 2; - } - sub->sendChunkSize = info->sendChunkSize; + if (ncclParamChunkSize() != 0) { + info->chunkSize = ncclParamChunkSize(); + } + op->chunkSize = info->chunkSize; + return ncclSuccess; +} + +ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) { + struct ncclChannel* channel = comm->channels+op->channelId; + op->opCount = channel->workFifoTail-1; + if (op->root == comm->rank) return ncclSuccess; + if (op->pattern == ncclPatternRecv) { + op->nsteps = DIVUP(op->nbytes, op->chunkSize); + if (op->nsteps == 0) op->nsteps = 1; + NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, op->connIndex)); + } else if (op->pattern == ncclPatternSend) { + op->nsteps = DIVUP(op->nbytes, op->chunkSize); 
+ if (op->nsteps == 0) op->nsteps = 1; + NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, op->connIndex)); } return ncclSuccess; } -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args) { - struct ncclProxySubArgs* sub = args->subs; - struct ncclChannel* channel = sub->channel; - args->opCount = channel->workFifoTail-1; - args->commOpCount = comm->opCount; - const ssize_t recvbytesOrig = sub->recvbytes; - const ssize_t sendbytesOrig = sub->sendbytes; - if (sub->delta > 0 && recvbytesOrig >= ssize_t(0)) { - int peerrecv = (comm->nRanks+comm->rank-sub->delta)%comm->nRanks; - sub->recvbytes = recvbytesOrig; - sub->sendbytes = 0; - sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize); - if (sub->nsteps == 0) sub->nsteps = 1; - NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, args->recvIdx)); - } - if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) { - int peersend = (comm->rank+sub->delta)%comm->nRanks; - sub->sendbytes = sendbytesOrig; - sub->recvbytes = 0; - sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize); - if (sub->nsteps == 0) sub->nsteps = 1; - NCCLCHECK(SaveProxy(proxySend, peersend, args, args->sendIdx)); - } - // Reset proxy args for potentially multiple cuda graph launches - // It is safe as long as SaveProxy copies contents of args to op - sub->recvbytes = recvbytesOrig; - sub->sendbytes = sendbytesOrig; - return ncclSuccess; -} - -static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { +static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; - DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next)); struct ncclProxyArgs* next = freeOp->next; + DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next)); *opPtr = next; if 
(freeOp->nextPeer) { // replace op by nextPeer @@ -325,7 +466,7 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs* if (*prevOpPtr) { (*prevOpPtr)->next = nextPeer; } else { - state->ops = nextPeer; + state->active = nextPeer; } nextPeer->next = next; *(prevOpPtr) = nextPeer; @@ -334,25 +475,31 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs* if (*prevOpPtr) { (*prevOpPtr)->next = next; } else { - state->ops = next; + state->active = next; } } - freeOp->next = state->poolFreed; - state->poolFreed = freeOp; - DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr)); + freeOp->next = state->pool; + state->pool = freeOp; + DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr)); +#ifdef DEBUG_PROXY NCCLCHECK(dumpProxyState(state)); +#endif return ncclSuccess; } -static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) { +static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; - struct ncclProxyArgs* op = *opsPtr; + struct ncclProxyArgs* op = opStart; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; - NCCLCHECK(op->progress(op)); + TIME_START(0); TIME_START(1); + NCCLCHECK(op->progress(comm, op)); + if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone) { + TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); + TIME_STOP(2); } else { prevOp = op; op = op->next; @@ -361,197 +508,606 @@ static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyAr return ncclSuccess; } -ncclResult_t ncclProxyAppendPosted(struct ncclProxyState* state) { - // Return any freed element first - if (state->poolFreed) { - struct 
ncclProxyArgs* end = state->poolFreed; - while (end->next) end = end->next; - pthread_mutex_lock(&state->poolMutex); - end->next = state->poolReturned; - state->poolReturned = state->poolFreed; - pthread_mutex_unlock(&state->poolMutex); - state->poolFreed = NULL; +static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (state->opsPool == NULL) return ncclInternalError; + struct ncclProxyOpsPool* pool = state->opsPool; + + struct ncclProxyArgs profArgs; // Only used for profiling purposes + if (state->nextOps != -1) goto process_nextops; + + // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock + // to be available. Exit, continue progress, and come back later. + if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess; + + if (state->active == NULL) { + pthread_mutex_lock(&pool->mutex); + while (pool->nextOps == -1 && !state->stop) { + struct ncclProxyArgs profArgs; // Only used for profiling purposes + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep); + pthread_cond_wait(&pool->cond, &pool->mutex); + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup); + } + if (state->stop) { // We might have been woken up to stop. 
+ pthread_mutex_unlock(&pool->mutex); + return ncclSuccess; + } } - // Then wait until we have new work to do - pthread_mutex_lock(&state->opsMutex); - while (state->postedOps == NULL) { - if (state->stop) return ncclSuccess; - pthread_cond_wait(&state->cond, &state->opsMutex); + state->nextOps = pool->nextOps; + pool->nextOps = pool->nextOpsEnd = -1; + pthread_mutex_unlock(&pool->mutex); + if (state->nextOps == -1) return ncclInternalError; + +process_nextops: + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend); + TIME_START(2); + int freeOp[NCCL_MAX_LOCAL_RANKS]; + int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; + for (int i=0; ilocalRanks; i++) freeOp[i] = -1; + + for (int opIndex = state->nextOps; opIndex != -1;) { + struct ncclProxyOp* peerOp = pool->ops+opIndex; + int peer = opIndex / MAX_OPS_PER_PEER; + if (peerOp->connection == NULL) return ncclInternalError; + if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next); + NCCLCHECK(ProxyAppend(state, peerOp)); + (*added)++; + int lastOpIndex = opIndex; + opIndex = peerOp->next; + // Return op to peer pool + if (freeOp[peer] == -1) { + freeOpEnd[peer] = lastOpIndex; + } else { + peerOp->next = freeOp[peer]; + } + freeOp[peer] = lastOpIndex; + state->nextOps = opIndex; } - // Sort operations as we append them : collectives and - // receives first, then sends. - - struct ncclProxyArgs* next, *prev = NULL, *op = state->postedOps; - int commOpCount = op->commOpCount; - while (op && op->commOpCount == commOpCount) { - next = op->next; - if (op->subs[0].sendbytes) { - if (prev) prev->next = next; - else state->postedOps = next; - op->next = NULL; - NCCLCHECK(ProxyAppend(state, op)); - } else prev = op; - op = next; + for (int i=0; ilocalRanks; i++) { + if (freeOp[i] == -1) continue; + int newFree = freeOp[i]; + int oldFree = pool->freeOps[i]; + pool->ops[freeOpEnd[i]].next = oldFree; + if (oldFree == -1) { + // Nothing for the main thread to consume, we can set it. 
+ pool->freeOps[i] = newFree; + } else { + // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked. + int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree); + if (swap != oldFree) { + if (swap != -1) return ncclInternalError; + // Ops were recycled while we were trying to swap, just set the value directly now. + pool->ops[freeOpEnd[i]].next = -1; + pool->freeOps[i] = newFree; + } + } } - op = state->postedOps; - while (op && op->commOpCount == commOpCount) { - next = op->next; - op->next = NULL; - NCCLCHECK(ProxyAppend(state, op)); - op = next; - } - state->postedOps = op; - if (op == NULL) state->postedOpsEnd = NULL; - NCCLCHECK(dumpProxyState(state)); - pthread_mutex_unlock(&state->opsMutex); - - if (state->poolFreed) { - struct ncclProxyArgs* end = state->poolFreed; - while (end->next) end = end->next; - pthread_mutex_lock(&state->poolMutex); - end->next = state->poolReturned; - state->poolReturned = state->poolFreed; - pthread_mutex_unlock(&state->poolMutex); - state->poolFreed = NULL; - } - + profArgs.opCount = *added; + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd); + TIME_STOP(2); return ncclSuccess; } +#include +static ncclProxyProgressState* ncclLastProxyState; +void ncclDumpProxyState(int signal) { + dumpProxyState(ncclLastProxyState); +} -void* persistentThread(void *comm_) { +void* ncclProxyProgress(void *comm_) { struct ncclComm* comm = (struct ncclComm*)comm_; - struct ncclProxyState* state = &comm->proxyState; - char threadName[16]; - sprintf(threadName, "NCCLproxy %5d", comm->rank); + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + state->nextOps = -1; + signal(SIGUSR1, ncclDumpProxyState); + ncclLastProxyState = state; + char threadName[NCCL_THREAD_NAMELEN]; + snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev); nvtxNameOsThreadA(syscall(SYS_gettid), threadName); - struct ncclProxyArgs** opsPtr = 
&state->ops; - while (1) { - if (*comm->abortFlag) { - return NULL; - } - - while (*opsPtr == NULL) { - if (state->stop) { - // No more commands to process and proxy has been requested to stop - return NULL; - } - ncclResult_t ret = ncclProxyAppendPosted(state); - if (ret != ncclSuccess) { - comm->fatalError = ret; - INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); - return NULL; - } - } + int lastIdle = 0; + struct ncclProxyArgs profArgs; // Only used for profiling purposes + while (state->stop == 0 && *comm->abortFlag == 0) { int idle = 1; - ncclResult_t ret = progressOps(state, opsPtr, &idle, comm); + ncclResult_t ret = progressOps(comm, state, state->active, &idle); if (ret != ncclSuccess) { comm->fatalError = ret; INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); return NULL; } + if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle); + if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive); if (idle) { - sched_yield(); // No request progressed. Let others run. + int added = 0; + TIME_START(3); + ret = ncclProxyGetPostedOps(comm, &added); + if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } + if (ret != ncclSuccess) { + comm->fatalError = ret; + INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); + } + if (added == 0) { + sched_yield(); // No request progressed. Let others run. 
+ } } + lastIdle = idle; } + return NULL; } ncclResult_t ncclProxyStart(struct ncclComm* comm) { - struct ncclProxyState* state = &comm->proxyState; - if (state->nextOps == NULL) return ncclSuccess; - pthread_mutex_lock(&state->opsMutex); - if (state->postedOps) state->postedOpsEnd->next = state->nextOps; - else state->postedOps = state->nextOps; - state->postedOpsEnd = state->nextOpsEnd; - state->nextOps = state->nextOpsEnd = NULL; - pthread_cond_signal(&state->cond); - pthread_mutex_unlock(&state->opsMutex); + struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps; + if (proxyOps == NULL) return ncclSuccess; + TIME_START(1); + for (int r=0; rlocalRanks; r++) { + struct ncclProxyOps* ops = proxyOps+r; + if (ops->pool == NULL || ops->nextOps == -1) continue; + NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd)); + ops->nextOps = ops->nextOpsEnd = -1; + ops->count = 0; + } comm->opCount++; + TIME_STOP(1); return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - if (state->size == 0) { - int p2pnChannels = 1; - while (p2pnChannels < comm->nChannels) p2pnChannels *= 2; - int p2pSize = 2*p2pnChannels*NCCL_MAX_WORK_ELEMENTS*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR; - int collNetSize = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE]; - state->size = std::max(p2pSize, collNetSize); +ncclResult_t ncclProxyProgressCreate(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (!state->thread) { + pthread_create(&state->thread, NULL, ncclProxyProgress, comm); + ncclSetThreadName(state->thread, "NCCL Progress%2d", comm->cudaDev); + } + return ncclSuccess; +} + +ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + + // Request the proxy to stop and then wake it + if 
(state->opsPool) { + pthread_mutex_lock(&state->opsPool->mutex); + state->stop = true; + pthread_cond_signal(&state->opsPool->cond); + pthread_mutex_unlock(&state->opsPool->mutex); + pthread_join(state->thread, NULL); } - *size = state->size; - - if (cuda && state->cudaBuff == NULL) { - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, cuda)); - } else if (state->hostBuff == NULL) { - NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size)); + // Free off any memory allocated for the proxy arg pools + while (state->pools != NULL) { + struct ncclProxyPool *next = state->pools->next; + free(state->pools); + state->pools = next; } - *ptr = cuda ? state->cudaBuff : state->hostBuff; + + ncclProfilingDump(); + TIME_PRINT("Proxy"); return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - // Use different pools for separate send/recv. - char* buff = cuda ? state->cudaBuff : state->hostBuff; - int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); - int globalSlot = (((type*comm->p2pnChannels+channel)*NCCL_STEPS)+slot)*NCCL_MAX_WORK_ELEMENTS+index; - *ptr = buff + slotSize * globalSlot; - return ncclSuccess; -} -ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - // Use different pools for different channels. - char* buff = cuda ? 
state->cudaBuff : state->hostBuff; - int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; - int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel; - *ptr = buff + slotSize * globalSlot; +struct ncclProxyAsyncOp { + int type; + struct ncclProxyConnection* connection; + int reqSize, respSize; + char *reqBuff, *respBuff; +}; + +struct ncclProxyLocalPeer { + struct ncclSocket sock; + int localRank; + struct ncclProxyAsyncOp asyncOps; +}; + +#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7 +#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2)) +#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1) +struct ncclProxyConnectionPool { + struct ncclProxyConnection** pools; + int banks; + int offset; + struct ncclProxyAsyncOp* ops; +}; + +static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { + if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) { + NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1)); + NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE)); + pool->banks++; + pool->offset = 0; + } + *id = ((pool->banks-1) << NCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset; + pool->offset++; return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - CUDACHECK(hipFree(state->cudaBuff)); - NCCLCHECK(ncclCudaHostFree(state->hostBuff)); +static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, int id, struct ncclProxyConnection** conn) { + int bank = id>>NCCL_PROXY_CONN_POOL_SIZE_POW2; + int offset = id&NCCL_PROXY_CONN_POOL_MASK; + if ((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL)) return ncclInternalError; + *conn = pool->pools[bank]+offset; + return ncclSuccess; +} + +static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + if (connection->send) { + 
NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm)); + } else { + NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm)); + } + return ncclSuccess; +} + +static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclComm* comm) { + for (int b=0; bbanks; b++) { + int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE; + for (int i=0; ipools[b]+i, comm)); + } + free(pool->pools[b]); + } + free(pool->pools); + return ncclSuccess; +} + +#include "transport.h" + +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) { + // Keep one connection per mlocal rank + proxyConn->connection = NULL; + proxyConn->rank = rank; + if (comm->proxyState.peerSocks == NULL) { + NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks)); + NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks)); + NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks)); + for (int r=0; rlocalRanks; r++) { + comm->proxyState.peerSocks[r].fd = -1; + comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag; + } + } + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank)); + struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank; + if (sock->fd == -1) { + memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(sock)); + } + int type = ncclProxyMsgInit; + NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*))); + struct ncclTransportComm* tcomm = send ? 
&ncclTransports[transport].send : &ncclTransports[transport].recv; + // If we need proxy progress, map progress ops + if (tcomm->proxyProgress) { + char poolPath[] = "/dev/shm/nccl-XXXXXX"; + NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1)); + struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank; + if (proxyOps->pool == NULL) { + NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0)); + proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; + } + } + INFO(NCCL_NET, "Connection to proxy localRank %d -> connection %p", proxyConn->localRank, proxyConn->connection); + proxyConn->comm = comm; + return ncclSuccess; +} + +const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" }; +ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { + if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError; + struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank; + if (sock->fd == -1) return ncclInternalError; + ncclResult_t ret; + + NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); + NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); + NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error); + NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error); + if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error); + if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error); + return ncclSuccess; +error: + WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]); + sock->fd = -1; + return ret; +} + +static ncclResult_t proxyProgressInit(struct ncclComm* comm) { + struct 
ncclProxyProgressState* state = &comm->proxyState.progressState; + if (state->opsPool == NULL) { + int size = sizeof(struct ncclProxyOpsPool); + struct ncclProxyOpsPool* pool = NULL; + + char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; + shmPath[0] = '\0'; + NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1)); + + // Init pool + pool->nextOps = -1; + + // The service thread may be launched already but localRanks may not be set yet. + while (comm->localRanks == 0) sched_yield(); + + for (int r=0; rlocalRanks; r++) { + pool->freeOps[r] = r*MAX_OPS_PER_PEER; + for (int i=0; iops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1; + pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1; + } + + // Setup mutex/cond to work inter-process + pthread_mutexattr_t mutexAttr; + pthread_mutexattr_init(&mutexAttr); + pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&pool->mutex, &mutexAttr); + pthread_condattr_t condAttr; + pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED); + pthread_cond_init(&pool->cond, &condAttr); + state->opsPool = pool; + + memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); + + // All ops structures are created, we can start the progress thread + NCCLCHECK(ncclProxyProgressCreate(comm)); + } + return ncclSuccess; +} + +static void proxyOpsFree(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) { + WARN("[Service thread] shm close failed"); + } +} + +ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (state->opsPool == NULL) return ncclSuccess; + + char shmPath[] = "/dev/shm/nccl-XXXXXX"; + memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1); + if (ncclShmUnlink(shmPath) != ncclSuccess) { + WARN("[Service thread] 
shm unlink failed"); + } + return ncclSuccess; +} + +static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { + struct ncclSocket* sock = &peer->sock; + int id; + struct ncclProxyConnection* connection; + NCCLCHECK(ncclProxyNewConnection(connectionPool, &id)); + NCCLCHECK(ncclProxyGetConnection(connectionPool, id, &connection)); + connection->sock = sock; + NCCLCHECK(ncclSocketRecv(sock, &connection->transport, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &connection->send, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int))); + connection->localRank = peer->localRank; + NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*))); + connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv; + // If we need proxy progress, let's allocate ops and start the thread + if (connection->tcomm->proxyProgress) { + NCCLCHECK(proxyProgressInit(comm)); + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); + } + INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? 
"send":"recv", id, connection->localRank, connection->transport); + return ncclSuccess; +} + +static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { + struct ncclSocket* sock = &peer->sock; + struct ncclProxyConnection* connection; + NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(void*))); + int reqSize, respSize; + NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int))); + if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError; + int nChannels; + NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int))); + if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels)); + return ncclSuccess; +} + +static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) { + int done = 1; + if (op->type == ncclProxyMsgSetup) { + NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + } else if (op->type == ncclProxyMsgConnect) { + NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + } else return ncclInternalError; + if (done) { + if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize)); + if (op->reqBuff) free(op->reqBuff); + if (op->respBuff) free(op->respBuff); + op->reqBuff = NULL; + op->respBuff = NULL; + op->type = 0; + (*asyncOpCount)--; + } + return ncclSuccess; +} + +static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) { + struct ncclSocket* sock = &peer->sock; + struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps; + asyncOp->type = type; + NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, 
sizeof(void*))); + + NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int))); + if (asyncOp->reqSize) { + NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize)); + NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize)); + } + if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize)); + (*asyncOpCount)++; + NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount)); + return ncclSuccess; +} + +#include + +void* ncclProxyService(void* _args) { + struct ncclComm* comm = (struct ncclComm *) _args; + if (hipSetDevice(comm->cudaDev) != hipSuccess) { + WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev); + } + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + + // Prepare poll descriptor + struct ncclProxyConnectionPool connectionPool; + connectionPool.pools = NULL; + connectionPool.banks = 0; + connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; + + struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; + struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; + for (int s=0; sproxyState.listenSock->fd; + pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; + + int maxnpeers = 0; + int npeers = 0; + int stop = 0; + int asyncOpCount = 0; + while (stop == 0 || (stop == 1 && npeers > 0)) { + if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 
0 : -1) < 0) { + WARN("[Proxy Service] Poll failed with error %d", error); + return NULL; + } + if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { + int s = 0; + while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++; + if (s == NCCL_MAX_LOCAL_RANKS) { + WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); + return NULL; + } + if (maxnpeers < s+1) maxnpeers = s+1; + struct ncclSocket* sock = &peers[s].sock; + if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) { + WARN("[Service thread] Accept failed %s", strerror(errno)); + } else { + pollfds[s].fd = sock->fd; + npeers++; + peers[s].localRank = -1; + } + } + for (int s=0; ssock; + struct ncclProxyAsyncOp* op = &peer->asyncOps; + int closeConn = 0; + int type = 0; + ncclResult_t res = ncclSuccess; + if (op->type != 0) { + res = proxyProgressAsync(op, comm, &asyncOpCount); + type = op->type; + if (res != ncclSuccess) op->type = 0; + } else if (pollfds[s].revents & POLLIN) { + int closed; + if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) { + WARN("[Service thread] Could not receive type from localRank %d", peer->localRank); + closeConn = 1; + } else if (closed) { + INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank); + closeConn = 1; + } else { + if (type == ncclProxyMsgAbort) { + stop = 2; + closeConn = 1; + } else if (type == ncclProxyMsgStop) { + stop = 1; + closeConn = 1; + } else if (type == ncclProxyMsgClose) { + closeConn = 1; + } else if (type == ncclProxyMsgInit) { + res = proxyConnInit(peers+s, &connectionPool, comm); + } else if (type == ncclProxyMsgSharedInit) { + res = proxyConnSharedInit(peers+s, &connectionPool, comm); + } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) { + res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount); + } else { + WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank); + closeConn = 1; 
+ } + } + } else if (pollfds[s].revents & POLLHUP) { + closeConn = 1; + } + if (res != ncclSuccess) { + WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res); + closeConn = 1; + } + if (closeConn) { + close(sock->fd); + sock->fd = pollfds[s].fd = -1; + npeers--; + } + } + } + // Wait for all operations to complete and stop progress thread before freeing any resource + if (ncclProxyProgressDestroy(comm) != ncclSuccess) { + WARN("[Proxy Service] proxyDestroy failed"); + } + for (int s=0; sproxyState.listenSock->fd); + free(comm->proxyState.listenSock); + proxyOpsFree(comm); + return NULL; +} + +ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) { + comm->proxyState.listenSock = sock; + comm->proxyState.peerAddresses = peerAddresses; + ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev); return ncclSuccess; } ncclResult_t ncclProxyCreate(struct ncclComm* comm) { - if (!comm->proxyThread) { - comm->proxyState.cond = PTHREAD_COND_INITIALIZER; - comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER; - comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER; - comm->proxyState.ops = NULL; - pthread_create(&comm->proxyThread, NULL, persistentThread, comm); - } + // comm->proxyState.thread is pthread_join()'d by commFree() in init.cc + pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm); return ncclSuccess; } ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { struct ncclProxyState* state = &comm->proxyState; - - // Request the proxy to stop and then wake it - pthread_mutex_lock(&state->opsMutex); - state->stop = true; - pthread_cond_signal(&state->cond); - pthread_mutex_unlock(&state->opsMutex); - if (comm->proxyThread) pthread_join(comm->proxyThread, NULL); - - // Free off any memory allocated for the proxy arg pools - 
pthread_mutex_lock(&state->poolMutex); - struct ncclProxyState* proxyState = &comm->proxyState; - while (proxyState->pools != NULL) { - struct ncclProxyPool *next = proxyState->pools->next; - free(proxyState->pools); - proxyState->pools = next; + if (state->peerAddresses) { + struct ncclSocket sock; + sock.abortFlag = NULL; + sock.asyncFlag = 0; + memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + int type = (*comm->abortFlag) ? ncclProxyMsgAbort : ncclProxyMsgStop; + NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); + close(sock.fd); + free(state->peerAddresses); + } + if (state->peerSocks) { + for (int i=0; ilocalRanks; i++) { + if (state->peerSocks[i].fd != -1) { + if (state->proxyOps[i].pool) { + NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool))); + } + if (state->sharedDevMems[i]) { + CUDACHECK(hipIpcCloseMemHandle(state->sharedDevMems[i])); + } + int type = ncclProxyMsgClose; + if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int))); + close(state->peerSocks[i].fd); + } + } + free(state->peerSocks); + free(state->proxyOps); + free(state->sharedDevMems); } - pthread_mutex_unlock(&state->poolMutex); - - NCCLCHECK(ncclProxySharedBuffersDestroy(comm)); - return ncclSuccess; } diff --git a/src/transport.cc b/src/transport.cc index 62940498cb..6b279cbdbb 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,15 +8,19 @@ #include "comm.h" #include "info.h" #include "bootstrap.h" +#define ENABLE_TIMER 0 +#include "timer.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; extern struct ncclTransport netTransport; +extern struct ncclTransport collNetTransport; struct ncclTransport ncclTransports[NTRANSPORTS] = { p2pTransport, shmTransport, netTransport, + collNetTransport }; template @@ -25,14 +29,12 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : comm->channels[channelId].peers[peer].recv + connIndex; - // handle intra-node network connections int n1 = -1, n2 = -1; if (connIndex == NCCL_CONN_IDX_P2P_NET) { NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, comm->rank, graph, channelId, (type == 1) ? 1 : 0, &n1)); NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 
0 : 1, &n2)); } - bool xgmi; NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi)); for (int t=0; t= 11030 // Stream used during transport setup; need for P2P pre-connect + CUDA Graph hipStream_t transportSetupStream; CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking)); -#endif int highestType = TRANSPORT_P2P; // track highest transport type struct ncclConnect data[2*MAXCHANNELS]; @@ -97,12 +97,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnect* recvData = data; int sendChannels = 0, recvChannels = 0; int type; + TIME_START(0); for (int c=0; c(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type)); if (type > highestType) highestType = type; } } + TIME_STOP(0); + TIME_START(1); struct ncclConnect* sendData = recvData+recvChannels; for (int c=0; c highestType) highestType = type; } } + TIME_STOP(1); + TIME_START(2); if (sendPeer == recvPeer) { if (recvChannels+sendChannels) { NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); @@ -124,38 +129,34 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels)); if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels)); } + TIME_STOP(2); + TIME_START(3); for (int c=0; cchannels[c].peers[sendPeer].send + connIndex; NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn)); conn->connected = 1; -#if CUDART_VERSION >= 11030 CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); -#else - CUDACHECK(hipMemcpy(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, 
sizeof(struct ncclConnector), hipMemcpyHostToDevice)); -#endif } } + TIME_STOP(3); + TIME_START(4); for (int c=0; cchannels[c].peers[recvPeer].recv + connIndex; NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn)); conn->connected = 1; -#if CUDART_VERSION >= 11030 CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); -#else - CUDACHECK(hipMemcpy(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); -#endif } } + TIME_STOP(4); comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0; } -#if CUDART_VERSION >= 11030 CUDACHECK(hipStreamSynchronize(transportSetupStream)); CUDACHECK(hipStreamDestroy(transportSetupStream)); -#endif if (highestTransportType != NULL) *highestTransportType = highestType; + TIME_PRINT("P2P Setup/Connect"); return ncclSuccess; } @@ -250,9 +251,9 @@ cleanup: ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { // AllGather collNet setup results - int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0}; - allGatherFailures[comm->intraNodeRank] = collNetSetupFail; - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int))); + int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0}; + allGatherFailures[comm->localRank] = collNetSetupFail; + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int))); for (int i=0; ilocalRanks; i++) { if (allGatherFailures[i] != 0) { collNetSetupFail = 1; @@ -260,7 +261,7 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa } } if (collNetSetupFail) { - if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network 
instead"); + if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); return ncclSystemError; } return ncclSuccess; @@ -273,12 +274,12 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { struct ncclPeer* peer = channel->peers+comm->nRanks; for (int b=0; bsend + b; - if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources)); + if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); send->transportResources = NULL; // avoid double free } for (int b=0; brecv + b; - if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources)); + if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv)); recv->transportResources = NULL; // avoid double free } } diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 9f9d9b5dd1..01f3ee6807 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,11 +8,15 @@ #include "comm.h" #include "coll_net.h" #include "graph.h" +#include "proxy.h" +#include "gdrwrap.h" -#define COLLNET_GROUP_NSUBS 8 -#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) +int64_t ncclParamGdrCopySyncEnable(); +int64_t ncclParamGdrCopyFlushEnable(); struct collNetRecvConnectInfo { + int rank; + int nranks; collNetHandle_t collNetHandle; }; @@ -21,132 +25,287 @@ struct collNetSendConnectInfo { void* reqFifo; }; +#define COLLNET_GROUP_NSUBS 8 +#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) + +#define NCCL_NET_MAP_HOSTMEM 0 +#define NCCL_NET_MAP_DEVMEM 1 +#define NCCL_NET_MAP_SHARED_HOSTMEM 2 +#define NCCL_NET_MAP_SHARED_DEVMEM 3 +#define NCCL_NET_MAP_GDCMEM 4 +#define NCCL_NET_MAP_MEMS 5 + +#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 +#define NCCL_NET_MAP_MASK_SHARED 0x80000000 +#define NCCL_NET_MAP_MASK_USED 0x20000000 +#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff + +#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ + ((mapStruct)->offsets.offsetName >> 30) + +#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName >> 29) == 0) + +#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ + (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? 
NULL : \ + (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) + +#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) + +#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ + int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ + if ((shared) == 0) { \ + if (dev) { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ + } else { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ + } \ + } else { \ + (mapStruct)->offsets.offsetName = bank; \ + } \ +} while (0); + +struct connectMapMem{ + char* gpuPtr; + char* cpuPtr; + int size; +}; + +struct connectMap { + int shared; + // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. + struct connectMapMem mems[NCCL_NET_MAP_MEMS]; + // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. 
+ struct { + uint32_t sendMem; + uint32_t recvMem; + uint32_t buffs[NCCL_NUM_PROTOCOLS]; + } offsets; +}; + struct reqSlot { volatile void* recvBuff; volatile int size; }; -struct collNetSendResources { - struct ncclComm* comm; +struct sendResources { + struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; + + int rank; + int nranks; int netDev; int useGdr; + uint64_t* gdcSync; + void* gdrDesc; void* sendMhandles[NCCL_NUM_PROTOCOLS]; void* recvMhandles[NCCL_NUM_PROTOCOLS]; - struct ncclRecvMem* devRecvMem; uint64_t step; - uint64_t llLastCleaning; struct reqSlot (*reqFifo)[NCCL_STEPS]; int collNetRank; uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) }; -struct collNetRecvResources { - struct ncclComm* comm; +struct recvResources { + struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; + + int rank; + int nranks; int netDev; int useGdr; + uint64_t* gdcSync; + uint64_t* gdcFlush; + void* gdrDesc; void* mhandles[NCCL_NUM_PROTOCOLS]; - struct ncclRecvMem* devRecvMem; uint64_t step; - uint64_t llLastCleaning; struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS]; int collNetRank; uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) }; -struct collNetSharedResources { - void* collNetListenComms[MAXCHANNELS]; - void* collNetComms[MAXCHANNELS]; - int collNetCommRefCount[MAXCHANNELS]; -}; - /* Determine if we can communicate with the peer */ -ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; return ncclSuccess; } -ncclResult_t collNetSharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) { - struct collNetSharedResources* resources = (struct 
collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; +struct setupReq { + int netDev; + int useGdr; +}; + + +/* Setup send connector, and return connect information for others in the coll + * communicator to connect to me */ +static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { + struct setupReq req; + + int proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); + send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; + + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); + + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : ""); + return ncclSuccess; +} + +static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { + struct setupReq req; + + int proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); + recv->conn.direct |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; + + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); + struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); + + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : ""); + return ncclSuccess; +} + +static ncclResult_t collNetDumpMap(struct connectMap* map) { + printf("Dump map\n"); + struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; + printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_DEVMEM; + printf("Mem 1: Vid mem CPU (%x B) %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; + printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; + printf("Mem 3: Shared Vid (%x B) mem CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); + printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); + for (int p=0; p Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, + map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 
1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); + } + printf("End of dump\n"); + return ncclSuccess; +} + +struct collNetConnectArgs { + int rank; + int nranks; + struct ncclConnect* connectInfos; +}; + +static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { + // We're on the same process as the proxy. We can pass a pointer to a struct. + struct collNetConnectArgs args = { rank, nranks, connectInfos }; + struct connectMap* map; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); + + // If collnet connect failed, propagate error to fallback on regular p2p + if (map == NULL) return ncclSystemError; + + //NCCLCHECK(collNetDumpMap(map)); + + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head; + + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + send->conn.tail = &recvMem->tail; + send->conn.sizesFifo = recvMem->sizesFifo; + for (int i=0; iconn.sizesFifo[i] = -1; + send->conn.offsFifo = recvMem->offsFifo; + + for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + return ncclSuccess; +} + +static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { + // We're on the same process as the proxy. We can pass a pointer to a struct. 
+ struct collNetConnectArgs args = { rank, nranks, connectInfos }; + struct connectMap* map; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); + + // If collnet connect failed, propagate error to fallback on regular p2p + if (map == NULL) return ncclSystemError; + + //NCCLCHECK(collNetDumpMap(map)); + + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + recv->conn.head = &sendMem->head; + + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; + recv->conn.offsFifo = recvMem->offsFifo; + + for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + } + return ncclSuccess; +} + +static ncclResult_t sendFree(struct ncclConnector* send) { + return ncclSuccess; +} + +static ncclResult_t recvFree(struct ncclConnector* recv) { + return ncclSuccess; +} + +static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*)reqBuff; + if (reqSize != sizeof(struct setupReq)) return ncclInternalError; + + struct sendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + connection->shared = 1; + + resources->netDev = req->netDev; + resources->useGdr = req->useGdr; + return ncclSuccess; +} + +struct sharedResources { + void* collNetListenComms[MAXCHANNELS]; + void* collNetComms[MAXCHANNELS]; + int commRefCount[NCCL_MAX_NETDEVS]; +}; + +ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) { + struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; if (resources == NULL) { 
NCCLCHECK(ncclCalloc(&resources, 1)); - comm->proxyState.sharedBuffs.collNetResources = resources; + comm->proxyState.progressState.collNet.resources = resources; } if (resources->collNetComms[netDev] == NULL) NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev)); return ncclSuccess; } -/* Setup send connector, and return connect information for others in the coll communicator to connect to me */ -ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - struct collNetSendResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - send->transportResources = resources; - send->conn.shared = 1; - resources->comm = comm; - - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); - - send->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev+1; - - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); - - int recvSize = offsetof(struct ncclRecvMem, buff); - // Simple uses shared buffers and we don't support LL128 - recvSize += send->comm->buffSizes[NCCL_PROTO_LL]; - - if (resources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr)); - CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); - send->conn.curr_hdp_reg = resources->curr_hdp_reg; - } - NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); - - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev, - resources->useGdr ? 
"/GDRDMA" : "", comm, comm->nRanks); - return ncclSuccess; -} - -/* Setup recv connector */ -ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - struct collNetRecvResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - recv->transportResources = resources; - recv->conn.shared = 1; - resources->comm = comm; - - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); - - recv->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev; - - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); - - int recvSize = offsetof(struct ncclRecvMem, buff); - // Simple uses shared buffers and we don't support LL128 - recvSize += recv->comm->buffSizes[NCCL_PROTO_LL]; - - if (resources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr)); - } - NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); - - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), resources->netDev, - resources->useGdr ? 
"/GDRDMA" : "", comm, comm->nRanks); - struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; - - NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle)); - return ncclSuccess; -} - -ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) { - struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; +static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) { + struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; if (resources->collNetComms[netDev] == NULL) { // Connect to coll comm collNetHandle_t** handlePtrs = NULL; @@ -159,157 +318,255 @@ ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct nccl resources->collNetListenComms[netDev], resources->collNetComms+netDev); free(handlePtrs); - NCCLCHECK(ret); - // Close listen comm - NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev])); + if (ret == ncclSuccess) { + // Close listen comm + NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev])); + } else { + resources->collNetListenComms[netDev] = NULL; + } } *collNetComm = resources->collNetComms[netDev]; - resources->collNetCommRefCount[netDev]++; + if (*collNetComm) resources->commRefCount[netDev]++; return ncclSuccess; } -ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { - // Setup device pointers - struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources; - struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank); +static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) { + struct sharedResources* 
resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; + resources->commRefCount[netDev]--; + if (resources->commRefCount[netDev] == 0) { + NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev])); + } + for (int n=0; ncommRefCount[n]) return ncclSuccess; + comm->proxyState.progressState.collNet.resources = NULL; + free(resources); + return ncclSuccess; +} - // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host - send->conn.buffs[NCCL_PROTO_LL] = resources->recvMem->buff; - send->conn.buffs[NCCL_PROTO_LL128] = send->conn.buffs[NCCL_PROTO_SIMPLE] = NULL; - send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; +static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) { + struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet; + if (state->size == 0) { + state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE]; + } - // Head/Tail/Opcount/Fifos are always on host - send->conn.tail = &resources->recvMem->tail; - send->conn.sizesFifo = resources->recvMem->sizesFifo; - send->conn.ptrsFifo = resources->recvMem->ptrsFifo; - send->conn.head = &resources->sendMem->head; - resources->sendMem->head = -NCCL_STEPS; // Don't give any credit yet when sharing buffers - for (int i=0; iconn.sizesFifo[i] = -1; + *size = state->size; + + if (cuda && state->cudaBuff == NULL) { + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, cuda)); + } + if (!cuda && state->hostBuff == NULL) { + NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size)); + } + *gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; + return ncclSuccess; +} + +static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) { + // Use different pools for different channels and also separate send/recv. 
+ int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel; + *offset = slotSize * globalSlot; + return ncclSuccess; +} + +static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) { + struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet; + if (state->size == 0) return ncclSuccess; + CUDACHECK(hipFree(state->cudaBuff)); + NCCLCHECK(ncclCudaHostFree(state->hostBuff)); + // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once. + state->size = 0; + return ncclSuccess; +} + +static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*)reqBuff; + if (reqSize != sizeof (struct setupReq)) return ncclInternalError; + + struct recvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + connection->shared = 1; + + resources->netDev = req->netDev; + resources->useGdr = req->useGdr; + + collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; + if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; + + NCCLCHECK(sharedListen(comm, req->netDev, netHandle)); + return ncclSuccess; +} + +static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } + struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; + struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); + + struct sendResources* resources = (struct sendResources*)(connection->transportResources); // Get info from recv 
side - resources->collNetRank = rank; + resources->collNetRank = args->rank; resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo); for (int p=0; precvMhandles[p] = info->mhandles[p]; - NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm)); + NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); + + // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. + if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } + if (resources->collNetComm == NULL) { + *((struct connectMap**)respBuff) = NULL; + return ncclSuccess; + } + connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev; + + struct connectMap* map = &resources->map; + + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); + + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); // sendMem->head + } + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + // Don't give credits yet in shared mode. 
+ resources->sendMem->head = -NCCL_STEPS; - int size; - char* ptr; // Allocate & Register shared buffers for the Simple protocol - NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + + NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); - // Allocate & Register shared buffers for the LL protocol - NCCLCHECK(ncclProxySharedBuffersInit(send->comm, 0, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, - NCCL_PTR_HOST, - &resources->sendMhandles[NCCL_PROTO_LL])); + *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } -ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { - // Setup device pointers - struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources; - struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank); - resources->collNetRank = rank; +static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } + struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; - // Intermediate buffering on GPU for GPU Direct RDMA - struct 
ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem; - int offset = 0; - for (int p=0; pconn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset; - offset += recv->comm->buffSizes[p]; + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); + resources->collNetRank = args->rank; + + NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); + + // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. + if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } + if (resources->collNetComm == NULL) { + *((struct connectMap**)respBuff) = NULL; + return ncclSuccess; } - recv->conn.direct |= resources->useGdr ? 
NCCL_DIRECT_NIC : 0; + connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1; - // Head/Tail/Opcount are always on host - recv->conn.tail = &resources->recvMem->tail; - recv->conn.ptrsFifo = resources->recvMem->ptrsFifo; - recv->conn.head = &resources->sendMem->head; + struct connectMap* map = &resources->map; - NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm)); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); - int size; - char* ptr; + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + if (ncclGdrCopy) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); + + if (ncclParamGdrCopySyncEnable()) { + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); + } + if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; + } + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Allocate & Register shared buffers for the Simple protocol - NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + + NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE])); - // Allocate & Register shared buffers for the LL protocol - NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, 0, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, - NCCL_PTR_HOST, - &resources->mhandles[NCCL_PROTO_LL])); - // Pass info to send side info->reqFifo = resources->reqFifo; for (int p=0; pmhandles[p] = resources->mhandles[p]; + if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } + *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } -ncclResult_t collNetSharedFree(struct ncclComm* comm, int netDev) { - struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; - resources->collNetCommRefCount[netDev]--; - if (resources->collNetCommRefCount[netDev] == 0) { - NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev])); +static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct sendResources* resources = (struct sendResources*)(connection->transportResources); + for (int p=0; psendMhandles[p]) { + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p])); + } } - for (int c=0; ccollNetCommRefCount[c]) return ncclSuccess; - comm->proxyState.sharedBuffs.collNetResources = NULL; - free(resources); + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); 
+ CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, resources->netDev)); + free(connection->transportResources); return ncclSuccess; } -ncclResult_t collNetSendFree(void* sendTransportResources) { - struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources; - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - if (resources->collNetComm) { - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_LL])); - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_SIMPLE])); +static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + for (int p=0; pmhandles[p]) { + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p])); + } } - if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem)); - - NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev)); - free(resources); + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, resources->netDev)); + free(connection->transportResources); return ncclSuccess; } -ncclResult_t collNetRecvFree(void* recvTransportResources) { - struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources; - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - if (resources->collNetComm) { - 
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_LL])); - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_SIMPLE])); - } - if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem)); - - NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev)); - free(resources); - return ncclSuccess; -} #define LAST_OF_GROUP(s) \ (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1) -ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { - if (args->protocol == NCCL_PROTO_LL128) { - WARN("CollNet does not support LL128"); +static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->protocol != NCCL_PROTO_SIMPLE) { + WARN("CollNet does not support LL/LL128"); return ncclInternalError; } if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources); + struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; @@ -325,23 +582,21 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources); + struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); void* sendMhandle = resources->sendMhandles[p]; void* recvMhandle = resources->recvMhandles[p]; - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; auto reqFifo = resources->reqFifo; if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { int buffSlot 
= (sub->base+sub->posted)%NCCL_STEPS; - if (p == NCCL_PROTO_SIMPLE) { - char* ptr; - int sharedBuffSlot = sub->posted%NCCL_STEPS; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, resources->useGdr, 0, sharedBuffSlot, 0, &ptr)); - resources->recvMem->ptrsFifo[buffSlot] = ptr + s*args->chunkSize; - __sync_synchronize(); - } - volatile uint64_t* sendHead = &resources->sendMem->head; + int sharedBuffSlot = sub->posted%NCCL_STEPS; + int offset; + NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset)); + resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize; + __sync_synchronize(); + volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; *sendHead = sub->base + sub->posted - NCCL_STEPS; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write } // Enforce sync between operations of the same group. bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received)); @@ -350,30 +605,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { int sharedBuffSlot = sub->received%NCCL_STEPS; volatile int* sizesFifo = resources->recvMem->sizesFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; - if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)) || p == NCCL_PROTO_LL)) { + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]); + if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) { // We have something to receive, let's check whether data is ready. - int size = sizesFifo[buffSlot]; int ready = 1; if (s == 0) { - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 0, sharedBuffSlot, 0, &args->sharedBuff[sharedBuffSlot])); - args->sharedSize[sharedBuffSlot] = p == NCCL_PROTO_SIMPLE ? 
args->chunkSize : size/2; - } - if (p == NCCL_PROTO_LL) { - char* localBuff = sub->connector->conn.buffs[p]; - uint32_t flag = NCCL_LL_FLAG(sub->base + sub->received + 1); - int nFifoLines = size / sizeof(union ncclLLFifoLine); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); - // Pack data into the shared buffer - uint32_t* sendBuff = (uint32_t*)(args->sharedBuff[sharedBuffSlot]+args->sharedSize[sharedBuffSlot]*s); - for (int i=0; isharedBuff[sharedBuffSlot] = localBuff + offset; + args->sharedSize[sharedBuffSlot] = args->chunkSize; } if (ready) { sizesFifo[buffSlot] = -1; @@ -437,15 +677,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { return ncclSuccess; } -ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { - if (args->protocol == NCCL_PROTO_LL128) { - WARN("CollNet does not support LL128"); +static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->protocol != NCCL_PROTO_SIMPLE) { + WARN("CollNet does not support LL/LL128"); return ncclInternalError; } if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources); + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0; @@ -460,19 +700,20 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources); + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); void* 
mhandle = resources->mhandles[p]; - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; auto reqFifo = resources->reqFifo; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); + // Enforce sync between operations of the same group. if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) { int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - char* ptr; int sharedBuffSlot = sub->posted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &ptr)); - reqFifo[group][buffSlot].recvBuff = ptr; + int offset; + NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); + reqFifo[group][buffSlot].recvBuff = localBuff + offset; TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff); sub->posted += args->sliceSteps; args->idle = 0; @@ -487,11 +728,24 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1); TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize); sub->received += args->sliceSteps; - if (reqFifo[group][buffSlot].size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) { - int startChannel = group*COLLNET_GROUP_NSUBS; - char* groupRecvAddress; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, 1, 1, sharedBuffSlot, startChannel, &groupRecvAddress)); - NCCLCHECK(collNetIflush(resources->collNetComm, groupRecvAddress, totalSize, mhandle, sub->requests+buffSlot)); + sub->requests[buffSlot] = NULL; + if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) { + // GDRCOPY support + if (resources->gdcFlush) { +#if defined (__x86_64__) + // Force a PCI-E read from GPU 
memory + asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); +#else + WARN("NET: GDR Flush only supported on x86_64"); + return ncclInternalError; +#endif + sub->requests[buffSlot] = NULL; + } else { + int startChannel = group*COLLNET_GROUP_NSUBS; + int offset; + NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); + NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); + } } else { for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; } @@ -517,27 +771,14 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS; int sharedBuffSlot = sub->transmitted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; - char* groupRecvAddress; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &groupRecvAddress)); - char* ptr = groupRecvAddress + (s%COLLNET_GROUP_NSUBS)*args->sharedSize[sharedBuffSlot]; - if (p == NCCL_PROTO_SIMPLE) { - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - ptrsFifo[buffSlot] = ptr; - __sync_synchronize(); - resources->recvMem->tail = sub->base + sub->flushed; - } - if (p == NCCL_PROTO_LL) { // ll - // re-attach flag - char* localBuff = sub->connector->conn.buffs[p]; - uint32_t flag = NCCL_LL_FLAG(sub->base + sub->transmitted + 1); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); - uint32_t* recvData = (uint32_t*)ptr; - int nFifoLines = DIVUP(args->sharedSize[sharedBuffSlot], 2*sizeof(uint32_t)); - for (int i=0; irecvMem->offsFifo; + offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize; + __sync_synchronize(); + volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; + *recvTail = sub->base + sub->flushed; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write sub->transmitted += args->sliceSteps; args->idle = 0; continue; @@ -562,7 +803,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { struct ncclTransport collNetTransport = { "COL", - collNetCanConnect, - { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy }, - { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy } + canConnect, + { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress }, + { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress } }; diff --git a/src/transport/net.cc b/src/transport/net.cc index be459840ca..b7b8b753b4 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,62 +9,135 @@ #include "net.h" #include "graph.h" #include +#include "proxy.h" #include "collectives.h" -#include #include "gdrwrap.h" +#include "shm.h" +#include "profiler.h" -struct netConnectInfo { - ncclNetHandle_t netHandle; +static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); + +#define NCCL_NET_MAP_HOSTMEM 0 +#define NCCL_NET_MAP_DEVMEM 1 +#define NCCL_NET_MAP_SHARED_HOSTMEM 2 +#define NCCL_NET_MAP_SHARED_DEVMEM 3 +#define NCCL_NET_MAP_GDCMEM 4 +#define NCCL_NET_MAP_MEMS 5 + +#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 +#define NCCL_NET_MAP_MASK_SHARED 0x80000000 +#define NCCL_NET_MAP_MASK_USED 0x20000000 +#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff + +#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ + ((mapStruct)->offsets.offsetName >> 30) + +#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName >> 29) == 0) + +#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ + (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? 
NULL : \ + (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) + +#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) + +#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ + int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ + if ((shared) == 0) { \ + if (dev) { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ + } else { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ + } \ + } else { \ + (mapStruct)->offsets.offsetName = bank; \ + } \ +} while (0); + +struct connectMapMem{ + char* gpuPtr; + char* cpuPtr; + int size; + union { + char shmPath[PATH_MAX]; + hipIpcMemHandle_t ipc; + }; }; -#define LOC_HOSTMEM 0 -#define LOC_DEVMEM 1 -#define LOC_COUNT 2 +struct connectMap { + int sameProcess; + int shared; + int cudaDev; + // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. + struct connectMapMem mems[NCCL_NET_MAP_MEMS]; + // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. 
+ struct { + uint32_t sendMem; + uint32_t recvMem; + uint32_t buffs[NCCL_NUM_PROTOCOLS]; + } offsets; +}; -struct netSendResources { +struct sendResources { + struct connectMap map; void* netSendComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; + + int rank; + int localRank; + int remoteRank; int netDev; int useGdr; + int maxRecvs; + uint64_t* gdcSync; + void* gdrDesc; int shared; - char* buffers[LOC_COUNT]; - int buffSizes[LOC_COUNT]; - void* mhandles[LOC_COUNT]; - void** mhandlesProto[NCCL_NUM_PROTOCOLS]; + int channelId; + int connIndex; + char* buffers[NCCL_NUM_PROTOCOLS]; + int buffSizes[NCCL_NUM_PROTOCOLS]; + void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) }; -struct netRecvResources { +struct recvResources { + struct connectMap map; void* netListenComm; void* netRecvComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; - // GDRCOPY support - void* gdrMemDesc; - struct ncclRecvMem* devRecvMem; - void* gdrFlushDesc; - int* devFlushMem; - + int rank; + int localRank; + int remoteRank; + int proxyRank; int netDev; int useGdr; + int maxRecvs; + uint64_t* gdcSync; + uint64_t* gdcFlush; + void* gdrDesc; int shared; - char* buffers[LOC_COUNT]; - int buffSizes[LOC_COUNT]; - void* mhandles[LOC_COUNT]; - void** mhandlesProto[NCCL_NUM_PROTOCOLS]; + int channelId; + int connIndex; + char* buffers[NCCL_NUM_PROTOCOLS]; + int buffSizes[NCCL_NUM_PROTOCOLS]; + void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) }; -NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2); +NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); /* Determine if two peers can communicate with NET */ -ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t 
canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // Same host? if (info1->hostHash == info2->hostHash) { // User disabled NET for intra-node? @@ -78,259 +151,641 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop } NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); +NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1); + +struct setupReq { + int rank; + int localRank; + int remoteRank; + int shared; + int netDev; + int useGdr; + int channelId; + int connIndex; +}; /* Determine if we will use this transport for this peer and return connect * information for this peer */ -ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - struct netSendResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - send->transportResources = resources; - send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1; - send->proxyAppendPtr = send->conn.shared ? 
comm->proxyState.sharedBuffs.proxyAppend+2*channelId+1 : &send->proxyAppend; +static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { + struct setupReq req; - resources->netDev = -1; - if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &resources->netDev)); - if (resources->netDev < 0) { - // Send/Receive: Round-robin NICs based on the receiver's CUDA device - int nicRR = comm->peerInfo[peerInfo->rank].cudaDev; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev)); + send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; + req.channelId = channelId; + req.connIndex = connIndex; + req.netDev = -1; + + int proxyRank = myInfo->rank; + if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &req.netDev)); + if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); + send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; + + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); + req.rank = myInfo->rank; + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); + req.remoteRank = peerInfo->rank; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); + + if (proxyRank == myInfo->rank) { + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + req.useGdr ? 
"/GDRDMA" : "", req.shared ? "/Shared" : ""); + } else { + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); - - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); - NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1)); - - send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; - send->conn.tail = &resources->recvMem->tail; - send->conn.sizesFifo = resources->recvMem->sizesFifo; - // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree - send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL; - send->conn.head = &resources->sendMem->head; - resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers - for (int i=0; iconn.sizesFifo[i] = -1; - - if (resources->shared == 0) { - int protoLoc[NCCL_NUM_PROTOCOLS]; - for (int p=0; puseGdr ? 
LOC_DEVMEM : LOC_HOSTMEM; - } - int buffSizes[NCCL_NUM_PROTOCOLS]; - for (int p=0; pcomm->buffSizes[p]; - resources->buffSizes[protoLoc[p]] += buffSizes[p]; - } - - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr)); - } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); - } - - int offsets[LOC_COUNT]; - offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0; - for (int p=0; pmhandlesProto[p] = resources->mhandles+protoLoc[p]; - send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]]; - offsets[protoLoc[p]] += buffSizes[p]; - } - } - - if (resources->useGdr) { - CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); - send->conn.curr_hdp_reg = resources->curr_hdp_reg; - } - INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, - peerInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "", - resources->shared ? 
"/Shared" : "", comm, comm->nRanks); + *((int*)connectInfo) = proxyRank; return ncclSuccess; } // GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory -NCCL_PARAM(GdrCopyTailEnable, "GDRCOPY_TAIL_ENABLE", 1); +NCCL_PARAM(GdrCopySyncEnable, "GDRCOPY_SYNC_ENABLE", 1); // GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); -ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - struct netRecvResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - recv->transportResources = resources; - recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1; - recv->proxyAppendPtr = recv->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId : &recv->proxyAppend; +/* Setup recv connector */ +static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { + struct setupReq req; - resources->netDev = -1; - if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev)); - if (resources->netDev < 0) { - // Send/Receive: Round-robin NICs based on the receiver's CUDA device - int nicRR = comm->cudaDev; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev)); - } - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); + recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? 
ncclParamNetSharedBuffers() : 1; + req.channelId = channelId; + req.connIndex = connIndex; + req.netDev = -1; - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); - NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1)); + // Use myInfo->rank as the receiver uses its own NIC + int proxyRank = myInfo->rank; + if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &req.netDev)); + if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); - // GDRCOPY tail support - if (ncclGdrCopy != NULL && ncclParamGdrCopyTailEnable() == 1) { - struct ncclRecvMem* devCudaPtr; - NCCLCHECK(ncclGdrCudaCalloc(&resources->devRecvMem, &devCudaPtr, 1, &resources->gdrMemDesc)); - // The GDR mapped VA doesn't work on the SMs - recv->conn.tail = &((struct ncclRecvMem*)devCudaPtr)->tail; - } else { - recv->conn.tail = &resources->recvMem->tail; - } + // We don't support PXN on receive yet + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); - // GDRCOPY flush support -#if defined (__x86_64__) - if (ncclGdrCopy != NULL && ncclParamGdrCopyFlushEnable() == 1) { - int* cudaPtr; - NCCLCHECK(ncclGdrCudaCalloc(&resources->devFlushMem, &cudaPtr, 1, &resources->gdrFlushDesc)); - } -#endif - - recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; - // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree - recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL; - recv->conn.head = &resources->sendMem->head; - - if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p - int protoLoc[NCCL_NUM_PROTOCOLS]; - for (int p=0; puseGdr ? 
LOC_DEVMEM : LOC_HOSTMEM; - } - - int buffSizes[NCCL_NUM_PROTOCOLS]; - for (int p=0; pcomm->buffSizes[p]; - resources->buffSizes[protoLoc[p]] += buffSizes[p]; - } - - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr)); - } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); - } - - int offsets[LOC_COUNT]; - offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0; - for (int p=0; pmhandlesProto[p] = resources->mhandles+protoLoc[p]; - recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]]; - offsets[protoLoc[p]] += buffSizes[p]; - } - } - - INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, peerInfo->rank, - peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,resources->useGdr ? "/GDRDMA" : "", - resources->shared ? "/Shared" : "", comm, comm->nRanks); - struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; - NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); + req.rank = myInfo->rank; + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); + req.remoteRank = peerInfo->rank; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); return ncclSuccess; } -ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { +static ncclResult_t netMapShm(struct connectMapMem* mem) { + NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0)); + NCCLCHECK(ncclShmUnlink(mem->shmPath)); + return ncclSuccess; +} +static ncclResult_t netCreateShm(struct connectMapMem* mem) { + mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file + NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1)); + return ncclSuccess; +} + +static ncclResult_t netDumpMap(struct connectMap* map) { + printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared); + struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; + printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_DEVMEM; + printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; + printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; + printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); + printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 
1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); + for (int p=0; p Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, + map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); + } + printf("End of dump\n"); + return ncclSuccess; +} + +static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers - struct netSendResources* resources = (struct netSendResources*)send->transportResources; - struct netConnectInfo* info = (struct netConnectInfo*)connectInfo; + struct connectMap* map; + NCCLCHECK(ncclCalloc(&map, 1)); + send->transportResources = map; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap))); - // Connect to remote peer - NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm)); + if (map->sameProcess) { + if (map->cudaDev != comm->cudaDev) { + // Enable P2P access + hipError_t err = hipDeviceEnablePeerAccess(map->cudaDev, 0); + if (err == hipErrorPeerAccessAlreadyEnabled) { + hipGetLastError(); + } else if (err != hipSuccess) { + WARN("failed to peer with device %d: %d %s", map->cudaDev, err, hipGetErrorString(err)); + return ncclInternalError; + } + } + } else { + NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + CUDACHECK(hipIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, hipIpcMemLazyEnablePeerAccess)); + map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL; + } + if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) { + void** 
sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank; + if (*sharedDevMemPtr == NULL) { + CUDACHECK(hipIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, hipIpcMemLazyEnablePeerAccess)); + } + map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr); + map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL; + } + } + //NCCLCHECK(netDumpMap(map)); - if (resources->shared) { - // Get shared buffers - int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM; - NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc)); - resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc; - } + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head; - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM])); - } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM])); - } + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + send->conn.tail = &recvMem->tail; + send->conn.sizesFifo = recvMem->sizesFifo; + // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree + send->conn.offsFifo = map->shared ? 
recvMem->offsFifo : NULL; + + for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); return ncclSuccess; } /* Connect to this peer */ -ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { - // Setup device pointers - struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources; +static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { + struct connectMap* map; + NCCLCHECK(ncclCalloc(&map, 1)); + recv->transportResources = map; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap))); + //NCCLCHECK(netDumpMap(map)); - // Finish connection establishment from remote peer - NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); - NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + recv->conn.head = &sendMem->head; - if (resources->shared) { - // Get shared buffers - int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM; - NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc)); - resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc; - } + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; + recv->conn.sizesFifo = recvMem->sizesFifo; + // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree + recv->conn.offsFifo = map->shared ? 
recvMem->offsFifo : NULL; - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM])); - } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM])); + for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + return ncclSuccess; +} + +static ncclResult_t sendFree(struct ncclConnector* send) { + struct connectMap* map = (struct connectMap*)(send->transportResources); + if (map->sameProcess == 0) { + NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + CUDACHECK(hipIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } } return ncclSuccess; } -ncclResult_t netSendFree(void* transportResources) { - struct netSendResources* resources = (struct netSendResources*)transportResources; - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - for (int l=0; lbuffers[l]) - NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l])); +static ncclResult_t recvFree(struct ncclConnector* recv) { + return ncclSuccess; +} + +#define NCCL_SHARED_STEPS 16 +static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess, + int nChannels, char** gpuPtr, char** cpuPtr, int* size, hipIpcMemHandle_t* ipc) { + if (cuda == 0 && sameProcess == 0) { + WARN("PXN should not use host buffers for data"); + return ncclInternalError; } - if (resources->shared == 0) { - NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM])); - CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM])); + struct ncclProxyProgressState* progressState = 
&comm->proxyState.progressState; + if (progressState->localPeers == NULL) { + NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + } + struct ncclProxyPeer** localPeers = progressState->localPeers; + if (localPeers[localRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers+localRank, 1)); + } + struct ncclProxyPeer* peer = localPeers[localRank]; + struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv; + state->refcount++; + if (state->size == 0) { + state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR; + } + + if (size) *size = state->size; + + if (cuda && state->cudaBuff == NULL) { + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, cuda)); + if (sameProcess == 0) { + CUDACHECK(hipIpcGetMemHandle(&state->ipc, state->cudaBuff)); + } + } + if (!cuda && state->hostBuff == NULL) { + NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size)); + } + if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; + if (sameProcess) { + if (gpuPtr) *gpuPtr = *cpuPtr; + } else { + if (gpuPtr) *gpuPtr = NULL; + if (ipc) memcpy(ipc, &state->ipc, sizeof(hipIpcMemHandle_t)); + } + return ncclSuccess; +} + +static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) { + // Use different pools for different channels and also separate send/recv. + int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); + int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; + *offset = slotSize * globalSlot; + return ncclSuccess; +} + +static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) { + if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); + struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank]; + if (peer == NULL) NCCLCHECK(ncclInternalError;) + struct ncclProxySharedP2p* state = type == 0 ? 
&peer->send : &peer->recv; + if (state->size == 0) NCCLCHECK(ncclInternalError); + state->refcount--; + if (state->refcount == 0) { + if (state->cudaBuff) CUDACHECK(hipFree(state->cudaBuff)); + if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff)); + } + if (peer->send.refcount || peer->recv.refcount) return ncclSuccess; + free(peer); + comm->proxyState.progressState.localPeers[localRank] = NULL; + for (int r=0; rlocalRanks; r++) { + if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess; + } + // All peers are freed, free array + free(comm->proxyState.progressState.localPeers); + comm->proxyState.progressState.localPeers = NULL; + return ncclSuccess; +} + +static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) { + int rank = comm->localRankToRank[connection->localRank]; + int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; + NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL)); + return ncclSuccess; +} + +static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*) reqBuff; + if (reqSize != sizeof(struct setupReq)) return ncclInternalError; + + struct sendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + + resources->rank = req->rank; + resources->localRank = req->localRank; + resources->remoteRank = req->remoteRank; + resources->netDev = req->netDev; + resources->shared = connection->shared = req->shared; + resources->useGdr = req->useGdr; + resources->channelId = req->channelId; + resources->connIndex = req->connIndex; + ncclNetProperties_t props; + NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + resources->maxRecvs = props.maxRecvs; + + // We don't return any 
data + if (respSize != 0) return ncclInternalError; + *done = 1; + return ncclSuccess; +} + +static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*) reqBuff; + if (reqSize != sizeof(struct setupReq)) return ncclInternalError; + + struct recvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + + resources->rank = req->rank; + resources->localRank = req->localRank; + resources->remoteRank = req->remoteRank; + resources->netDev = req->netDev; + resources->shared = connection->shared = req->shared; + resources->useGdr = req->useGdr; + resources->channelId = req->channelId; + resources->connIndex = req->connIndex; + ncclNetProperties_t props; + NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + resources->maxRecvs = props.maxRecvs; + + if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; + NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm)); + *done = 1; + return ncclSuccess; +} + +static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct sendResources* resources = (struct sendResources*)(connection->transportResources); + if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; + + if (resources->shared) { + // Shared buffers + struct ncclProxyProgressState* progressState = &comm->proxyState.progressState; + if (progressState->localPeers == NULL) { + NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + } + struct ncclProxyPeer** localPeers = progressState->localPeers; + if (localPeers[resources->localRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1)); + } + connection->proxyAppendPtr = 
localPeers[resources->localRank]->send.proxyAppend+resources->channelId; + + if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + // Connect or reuse connection for a netdev/remote rank. + if (progressState->netComms[resources->netDev] == NULL) { + NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); + } + struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank; + if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId)); + resources->netSendComm = comms->sendComm[resources->channelId]; + if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; + } else { + NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + } + } else { + // Connect to remote peer + NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + connection->proxyAppendPtr = &connection->proxyAppend; + } + if (resources->netSendComm == NULL) { + *done = 0; + return ncclSuccess; + } + *done = 1; + + // Create structures + struct connectMap* map = &resources->map; + map->sameProcess = + comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; + map->shared = resources->shared; + CUDACHECK(hipGetDevice(&map->cudaDev)); + + if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p + for (int p=0; puseGdr, comm->buffSizes[p], buffs[p]); + resources->buffSizes[p] = comm->buffSizes[p]; + } + } else { + // Get shared buffers + int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit( + comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels, + &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc)); + resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + } + + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + if (resources->shared == 0) { + if (!map->sameProcess) { + ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN); + } + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr)); + map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; + } + if (!map->sameProcess) { + CUDACHECK(hipIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } + } + if (map->sameProcess) { + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + } else { + NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM)); + } + if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); + + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); // sendMem->head + } + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, 
recvMem); + + // Don't give credits yet in shared mode. + resources->sendMem->head = map->shared ? -NCCL_STEPS : 0; + for (int i=0; irecvMem->sizesFifo[i] = -1; + + for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); + if (resources->buffers[p]) { + NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } + } + + //NCCLCHECK(netDumpMap(map)); + if (respSize != sizeof(struct connectMap)) return ncclInternalError; + memcpy(respBuff, map, sizeof(struct connectMap)); + return ncclSuccess; +} + +static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(int)) return ncclInternalError; + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + resources->proxyRank = *(int*)reqBuff; + + // Finish connection establishment from remote peer + if (resources->shared) { + // Shared buffers + struct ncclProxyProgressState* progressState = &comm->proxyState.progressState; + if (progressState->localPeers == NULL) { + NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + } + struct ncclProxyPeer** localPeers = progressState->localPeers; + if (localPeers[resources->localRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1)); + } + connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId; + + if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + // Connect or reuse connection for a netdev/remote rank. 
+ if (progressState->netComms[resources->netDev] == NULL) { + NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); + } + struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; + if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId)); + resources->netRecvComm = comms->recvComm[resources->channelId]; + if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; + } else { + NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + } + } else { + // Connect to remote peer + NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + connection->proxyAppendPtr = &connection->proxyAppend; + } + if (resources->netRecvComm == NULL) { + *done = 0; + return ncclSuccess; + } + *done = 1; + NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); + + // Create structures + struct connectMap* map = &resources->map; + map->sameProcess = + comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; + if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv + map->shared = resources->shared; + + if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p + for (int p=0; puseGdr, comm->buffSizes[p], buffs[p]); + resources->buffSizes[p] = comm->buffSizes[p]; + } + } else { + // Get shared buffers + int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit( + comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels, + &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); + resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + } + + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + if (resources->shared == 0) { + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, resources->useGdr)); + map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; + } + } + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + if (ncclGdrCopy && map->sameProcess) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); + + if (ncclParamGdrCopySyncEnable()) { + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); + } + if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; + } + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); + if (resources->buffers[p]) { + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } + } + + //NCCLCHECK(netDumpMap(map)); + if (respSize != sizeof(struct connectMap)) return ncclInternalError; + memcpy(respBuff, map, sizeof(struct connectMap)); + return ncclSuccess; +} + +static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct sendResources* resources = (struct sendResources*)(connection->transportResources); + if (resources == NULL) { // NVB Preconnect + NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0)); + return ncclSuccess; + } + for (int p=0; pbuffers[p]) { + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p])); + } + } + struct connectMapMem* mems = resources->map.mems; + if (resources->map.sameProcess) { + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + } else { + NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size)); + } + CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + if (resources->shared) { + NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0)); + if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank; + comms->sendRefCount[resources->channelId]--; + if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId])); + } else { + NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); + } + } else { + NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); } - NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); free(resources); return ncclSuccess; } -ncclResult_t netRecvFree(void* transportResources) { - struct netRecvResources* resources = (struct netRecvResources*)transportResources; - // GDRCOPY support - if (resources->gdrFlushDesc) { - 
NCCLCHECK(ncclGdrCudaFree(resources->gdrFlushDesc)); +static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + if (resources == NULL) { // NVB Preconnect + NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1)); + return ncclSuccess; } - // GDRCOPY support - if (resources->gdrMemDesc) { - NCCLCHECK(ncclGdrCudaFree(resources->gdrMemDesc)); + for (int p=0; pbuffers[p]) { + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p])); + } } - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - for (int l=0; lbuffers[l]) - NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l])); + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(hipFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + if (resources->shared) { + NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1)); + if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank; + comms->recvRefCount[resources->channelId]--; + if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId])); + } else { + NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); + } + } else { + NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); } - if (resources->shared == 0) { - NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM])); - CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM])); - } - NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); free(resources); return ncclSuccess; } static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); 
-ncclResult_t netSendProxy(struct ncclProxyArgs* args) { +static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources); + struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; + for (uint64_t step=0; stepnsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); } args->state = ncclProxyOpProgress; args->hdp_flushed = 0; @@ -338,29 +793,33 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; + int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; if (sub->done == sub->nsteps) continue; - struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources); - void* mhandle = *(resources->mhandlesProto[p]); - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = sub->connector->conn.buffs[p]; + struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); + void* mhandle = resources->mhandles[p]; + int stepSize = resources->buffSizes[p] / NCCL_STEPS; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSize = stepSize*args->sliceSteps; - if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; - if (sub->sendbytes < buffSize) buffSize = sub->sendbytes; + if (sub->nbytes < buffSize) buffSize = sub->nbytes; // Post buffers to the GPU - if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { + if (sub->posted < 
sub->nsteps && sub->posted < sub->done + maxDepth) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { - char* ptr; - int sharedBuffSlot = sub->posted%NCCL_STEPS; - NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 0, sub->channel->id, sharedBuffSlot, s, &ptr)); - resources->recvMem->ptrsFifo[buffSlot] = ptr; + int sharedBuffSlot = sub->posted%maxDepth; + int offset; + NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset)); + resources->recvMem->offsFifo[buffSlot] = offset; __sync_synchronize(); - volatile uint64_t* sendHead = &resources->sendMem->head; + volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; *sendHead = sub->base + sub->posted - NCCL_STEPS; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write } else sub->posted += args->sliceSteps; + for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) { + ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait); + } args->idle = 0; continue; } @@ -372,7 +831,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. int size = sizesFifo[buffSlot]; - char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize; + char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; int ready = 1; if (p == NCCL_PROTO_LL128) { ready = resources->useGdr; @@ -404,22 +863,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { STORE(resources->curr_hdp_reg, 1); } // Data is ready, try to send. 
- NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot)); + NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { -#ifdef ENABLE_PROFILING - if (sub->channel->active_req == 0) { - gettimeofday(&sub->channel->tvs, NULL); - sub->channel->sizes = 0; - } - sub->channel->active_req ++; - sub->channel->sizes += LOAD(sizesFifo+buffSlot); - sub->channel->send_byte += LOAD(sizesFifo+buffSlot); -#endif - TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); sizesFifo[buffSlot] = -1; // Make sure size is reset to zero before we update the head. __sync_synchronize(); sub->transmitted += args->sliceSteps; + for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait); args->idle = 0; continue; } @@ -432,29 +883,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { int buffSlot = (sub->base+sub->done)%NCCL_STEPS; NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); if (done) { - TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); -#ifdef ENABLE_PROFILING - if (args->protocol == NCCL_PROTO_SIMPLE) { - sub->channel->active_req --; - if (sub->channel->active_req == 0) { - struct timeval tv; - gettimeofday(&tv, NULL); - float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec; - if (delta) { -#ifdef ENABLE_TIMING_PROFILE - sub->channel->bw_cumulative += (float)delta/1E3; -#else - sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3; -#endif - sub->channel->bw_count ++; - } - } - } -#endif + TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, 
sub->requests[buffSlot]); sub->done += args->sliceSteps; + for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd); if (resources->shared == 0) { - resources->sendMem->head = sub->base + sub->done; + volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; + *sendHead = sub->base + sub->done; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write } args->idle = 0; if (sub->done == sub->nsteps) { @@ -471,140 +907,203 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { return ncclSuccess; } -ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { +static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { + // Initialize subs and group them by same recvComm. + void* recvComm; + int groupSize = 0; + int maxRecvs = 1; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources); + if (groupSize == maxRecvs) { + groupSize = 0; + } else if (s>0) { // Find next sub with the same recvComm + int next; + for (next=s; nextnsubs; next++) { + struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources); + if (nextRes->netRecvComm == recvComm) break; + } + if (next == args->nsubs) { // Not found + groupSize = 0; + } else if (s != next) { // We found a sub later with the same recvComm ; swap subs + struct ncclProxySubArgs temp; + memcpy(&temp, sub, sizeof(struct ncclProxySubArgs)); + memcpy(sub, args->subs+next, sizeof(struct ncclProxySubArgs)); + memcpy(args->subs+next, &temp, sizeof(struct ncclProxySubArgs)); + } + } + groupSize++; + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + maxRecvs = resources->maxRecvs; + recvComm = resources->netRecvComm; // Round to next multiple of 
sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; + for (int i=0; insteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - for (int s=0; snsubs; s++) { - struct ncclProxySubArgs* sub = args->subs+s; - if (sub->done == sub->nsteps) continue; - struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources); - void* mhandle = *(resources->mhandlesProto[p]); - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = sub->connector->conn.buffs[p]; - int buffSize = stepSize*args->sliceSteps; - if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; - if (sub->recvbytes < buffSize) buffSize = sub->recvbytes; + int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); + for (int s=0; snsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + int subCount = 0; + void* ptrs[NCCL_PROXY_MAX_SUBS]; + int sizes[NCCL_PROXY_MAX_SUBS]; + int tags[NCCL_PROXY_MAX_SUBS]; + void* mhandles[NCCL_PROXY_MAX_SUBS]; - if ((sub->posted < sub->done + NCCL_STEPS) && (sub->posted < sub->nsteps)) { - int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - char* ptr; - if (resources->shared) { - int sharedBuffSlot = sub->posted%NCCL_STEPS; - NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 1, sub->channel->id, sharedBuffSlot, s, &ptr)); - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - ptrsFifo[buffSlot] = ptr; - } else { - ptr = localBuff+buffSlot*stepSize; - } - NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot)); - if (sub->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]); -#ifdef 
ENABLE_PROFILING - if (args->protocol == NCCL_PROTO_SIMPLE) { - if (sub->channel->active_req == 0) { - gettimeofday(&sub->channel->tvs, NULL); - sub->channel->sizes = 0; - } - sub->channel->active_req ++; + for (int i=0; igroupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + if (sub->posted < sub->nsteps) { + if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + int stepSize = resources->buffSizes[p] / NCCL_STEPS; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); + int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; + if (resources->shared) { + int sharedBuffSlot = sub->posted%maxDepth; + int offset; + NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset)); + volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo; + offsFifo[buffSlot] = offset; + ptrs[subCount] = localBuff+offset; + } else { + ptrs[subCount] = localBuff+buffSlot*stepSize; } -#endif - sub->posted += args->sliceSteps; - args->idle = 0; - continue; + sizes[subCount] = stepSize*args->sliceSteps; + if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; + tags[subCount] = resources->remoteRank; + mhandles[subCount] = resources->mhandles[p]; + subCount++; } } - if (sub->posted > sub->received) { - int buffSlot = (sub->base+sub->received)%NCCL_STEPS; - int done, size; - NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, &size)); + if (subCount) { + uint64_t step = subGroup->posted; + struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + void** requestPtr = subGroup->requests+(step%NCCL_STEPS); + NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + if (*requestPtr) { + for (int i=0; igroupSize; i++) { + struct ncclProxySubArgs* sub = subGroup+i; + sub->posted += args->sliceSteps; + for (uint64_t 
step=sub->posted-args->sliceSteps; stepposted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait); + } + args->idle = 0; + } + } + } + if (args->idle == 0) return ncclSuccess; + + for (int s=0; snsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + if (subGroup->posted > subGroup->received) { + uint64_t step = subGroup->received; + int done; + void* ptrs[NCCL_PROXY_MAX_SUBS]; + int sizes[NCCL_PROXY_MAX_SUBS]; + void* mhandles[NCCL_PROXY_MAX_SUBS]; + for (int i=0; irequests[step%NCCL_STEPS], &done, sizes)); if (done) { - sub->received += args->sliceSteps; -#ifdef ENABLE_PROFILING - if (args->protocol == NCCL_PROTO_SIMPLE) { - sub->channel->active_req --; - sub->channel->sizes += size; - sub->channel->recv_byte += size; - if (sub->channel->active_req == 0) { - struct timeval tv; - gettimeofday(&tv, NULL); - float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec; - if (delta) { -#ifdef ENABLE_TIMING_PROFILE - sub->channel->bw_cumulative += (float)delta/1E3; -#else - sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3; -#endif - sub->channel->bw_count ++; - } + int useGdr = 0; + int totalSize = 0; + for (int i=0; igroupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + sub->received += args->sliceSteps; + for (uint64_t step=sub->received-args->sliceSteps; stepreceived; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); + if (step < sub->nsteps) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + if (resources->useGdr) useGdr = 1; } } -#endif - if (size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) { - // Don't pass data to the GPU yet, flush first. 
- + subGroup->requests[step%NCCL_STEPS] = NULL; + if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && useGdr) { // GDRCOPY support - if (resources->devFlushMem) { + struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + if (resources->gdcFlush) { #if defined (__x86_64__) // Force a PCI-E read from GPU memory - asm volatile ("mov (%0), %%eax" :: "l"(resources->devFlushMem) : "%eax"); + asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); #else WARN("NET: GDR Flush only supported on x86_64"); return ncclInternalError; #endif - sub->requests[buffSlot] = NULL; } else { - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize; - NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, sub->requests+buffSlot)); + int subCount = 0; + for (int i=0; igroupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + if (step < sub->nsteps) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + int stepSize = resources->buffSizes[p] / NCCL_STEPS; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); + int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; + ptrs[subCount] = resources->shared ? 
localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; + mhandles[subCount] = resources->mhandles[p]; + subCount++; + } + } + struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } - } else { - sub->requests[buffSlot] = NULL; } args->idle = 0; - continue; } } - if (sub->received > sub->transmitted) { - // Progress flush operations - int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + } + if (args->idle == 0) return ncclSuccess; + + for (int s=0; snsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + if (subGroup->received > subGroup->transmitted) { + uint64_t step = subGroup->transmitted; int done = 1; - if (sub->requests[buffSlot]) NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); + void* request = subGroup->requests[step%NCCL_STEPS]; + if (request) NCCLCHECK(ncclNetTest(request, &done, NULL)); if (done) { - sub->transmitted += args->sliceSteps; - __sync_synchronize(); - if (resources->devRecvMem) { - // GDRCOPY support: Write updated tail directly to the device memory - resources->devRecvMem->tail = sub->base + sub->transmitted; - wc_store_fence(); // Flush out WC write - } else { - resources->recvMem->tail = sub->base + sub->transmitted; + for (int i=0; igroupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + sub->transmitted += args->sliceSteps; + for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait); + if (step < sub->nsteps) { + __sync_synchronize(); + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; + *recvTail = sub->base + sub->transmitted; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write + } } args->idle = 0; - continue; } } - if (sub->transmitted > sub->done) { - volatile uint64_t* sendHead = &resources->sendMem->head; - uint64_t done = *sendHead; - while (done > sub->base + sub->done && - // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. - sub->transmitted > sub->done) { - sub->done += args->sliceSteps; - args->idle = 0; - if (sub->done == sub->nsteps) { - resources->step = sub->base + sub->nsteps; - args->done++; + } + if (args->idle == 0) return ncclSuccess; + + for (int s=0; snsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + for (int i=0; igroupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + if (sub->done == sub->nsteps) continue; + if (sub->transmitted > sub->done) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + volatile uint64_t* sendHead = &resources->sendMem->head; + uint64_t done = *sendHead; + while (done > sub->base + sub->done && + // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. 
+ sub->transmitted > sub->done) { + sub->done += args->sliceSteps; + for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd); + args->idle = 0; + if (sub->done == sub->nsteps) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + resources->step = sub->base + sub->nsteps; + args->done++; + break; + } } } } @@ -618,7 +1117,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { struct ncclTransport netTransport = { "NET", - netCanConnect, - { netSendSetup, netSendConnect, netSendFree, netSendProxy }, - { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy } + canConnect, + { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress }, + { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress } }; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 2825a967e5..f6bd92ce06 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -21,26 +21,44 @@ #include #include #include +#define ENABLE_TIMER 0 +#include "timer.h" #include "ibvwrap.h" #define USE_RDMA_WRITE 1 #define MAXNAMESIZE 64 static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; -static union socketAddress ncclIbIfAddr; +static union ncclSocketAddress ncclIbIfAddr; + +struct ncclIbMr { + uintptr_t addr; + int pages; + int refs; + ibv_mr *mr; +}; + +struct ncclIbMrCache { + struct ncclIbMr *slots; + int capacity, population; +}; static int ncclNIbDevs = -1; -struct ncclIbDev { +struct alignas(64) ncclIbDev { + pthread_mutex_t lock; int device; uint64_t guid; uint8_t port; uint8_t link; int speed; ibv_context* context; + int pdRefs; + ibv_pd* pd; char devName[MAXNAMESIZE]; char* pciPath; int realPort; int maxQp; + struct ncclIbMrCache mrCache; }; #define MAX_IB_PORT 15 @@ -53,6 +71,7 @@ struct userIbDev { struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; struct userIbDev userIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; +static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0); NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14); @@ -62,6 +81,7 @@ NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); NCCL_PARAM(IbSl, "IB_SL", 0); NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); +NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { @@ -115,17 +135,28 @@ static int ncclIbSpeed(int speed) { return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds)/sizeof(int)-1)]; } +// Determine whether RELAXED_ORDERING is enabled and possible +static int ncclIbRelaxedOrderingCapable(void) { + int roMode = ncclParamIbPciRelaxedOrdering(); + ncclResult_t r = ncclInternalError; + if (roMode == 1 || roMode == 2) { + // Query IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support + r = 
wrap_ibv_reg_mr_iova2(NULL, NULL, NULL, 0, 0, 0); + } + return r == ncclInternalError ? 0 : 1; +} + ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { + if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } - if (ncclParamIbDisable()) return ncclInternalError; if (ncclNIbDevs == -1) { pthread_mutex_lock(&ncclIbLock); wrap_ibv_fork_init(); if (ncclNIbDevs == -1) { ncclNIbDevs = 0; - if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { + if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { WARN("NET/IB : No IP interface found."); return ncclInternalError; } @@ -176,18 +207,27 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { } TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; ncclIbDevs[ncclNIbDevs].port = port; ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); ncclIbDevs[ncclNIbDevs].context = context; + ncclIbDevs[ncclNIbDevs].pdRefs = 0; + ncclIbDevs[ncclNIbDevs].pd = NULL; strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort)); ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; + ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; + ncclIbDevs[ncclNIbDevs].mrCache.population = 0; + ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; + + pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); + pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d + ncclSetThreadName(ncclIbAsyncThread, "NCCL 
IbAsync %2d", ncclNIbDevs); ncclNIbDevs++; nPorts++; - pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); // [RCCL] pthread_detach(ncclIbAsyncThread); // [/RCCL] @@ -201,13 +241,16 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { } else { char line[1024]; line[0] = '\0'; + // Determine whether RELAXED_ORDERING is enabled and possible + ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); for (int d=0; dname = ncclIbDevs[dev].devName; props->pciPath = ncclIbDevs[dev].pciPath; @@ -255,18 +300,23 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { props->ptrSupport |= NCCL_PTR_CUDA; } props->speed = ncclIbDevs[dev].speed; + props->latency = 0; // Not set props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort; props->maxComms = ncclIbDevs[dev].maxQp; + props->maxRecvs = NCCL_NET_IB_MAX_RECVS; return ncclSuccess; } -#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS +// We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive +#define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS) +static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion"); #define NCCL_IB_MAX_QPS 128 struct ncclIbQpInfo { uint32_t lid; uint8_t ib_port; + uint8_t link_layer; uint32_t qpn[NCCL_IB_MAX_QPS]; // For RoCE @@ -279,46 +329,83 @@ struct ncclIbQpInfo { uint64_t fifoAddr; }; -struct ncclIbHandle { - union socketAddress connectAddr; +enum ncclIbCommState { + ncclIbCommStateStart = 0, + ncclIbCommStateConnect = 1, + ncclIbCommStateAccept = 3, + ncclIbCommStateSend = 4, + ncclIbCommStateRecv = 5, + ncclIbCommStateConnected = 6, }; +struct ncclIbCommStage { + enum ncclIbCommState state; + int offset; + void* buffer; + void* comm; +}; + +struct ncclIbHandle { + union ncclSocketAddress connectAddr; // Filled by the target + struct ncclIbCommStage stage; // Used by the other side when connecting +}; + +#define NCCL_NET_IB_REQ_UNUSED 0 +#define 
NCCL_NET_IB_REQ_SEND 1 +#define NCCL_NET_IB_REQ_RECV 2 +#define NCCL_NET_IB_REQ_FLUSH 3 + struct ncclIbRequest { - int used; - int type; struct ncclIbVerbs* verbs; + int type; int events; - int size; - union socketAddress *addr; + union ncclSocketAddress *addr; + int nreqs; + union { + struct { + int size; + void* data; + uint32_t lkey; + int offset; + } send; + struct { + int sizes[NCCL_NET_IB_MAX_RECVS]; + } recv; + }; }; struct ncclIbVerbs { - struct ibv_pd* pd; + int dev; + struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd struct ibv_cq* cq; - uint64_t pad[2]; + uint64_t pad[1]; struct ncclIbRequest reqs[MAX_REQUESTS]; }; struct ncclIbListenComm { int dev; - int fd; + struct ncclSocket sock; + struct ncclIbCommStage stage; }; struct alignas(64) ncclIbSendFifo { uint64_t addr; int size; - uint32_t seq; uint32_t rkey; - uint32_t ready; - uint64_t pad[1]; // Pad FIFO element size to be 32-bytes + uint32_t nreqs; + uint32_t tag; + uint64_t idx; }; struct ncclIbSendComm { struct ncclIbVerbs verbs; - struct ncclIbSendFifo fifo[MAX_REQUESTS]; - uint32_t fifoHead; - int fd; - union socketAddress addr; + struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + uint64_t fifoHead; + struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1]; + struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; + struct ncclSocket sock; + int ready; struct ibv_qp* qps[NCCL_IB_MAX_QPS]; int nqps; @@ -339,10 +426,10 @@ struct ncclIbGpuFlush { }; struct ncclIbRemFifo { - struct ncclIbSendFifo elems[MAX_REQUESTS]; + struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + uint64_t fifoTail; uint64_t addr; uint32_t rkey; - uint32_t tail; uint32_t flags; struct ibv_mr* mr; struct ibv_sge sge; @@ -351,8 +438,7 @@ struct ncclIbRemFifo { struct ncclIbRecvComm { struct ncclIbVerbs verbs; struct ncclIbRemFifo remFifo; - int fd; - union socketAddress addr; + struct ncclSocket sock; int ready; struct ibv_qp* 
qps[NCCL_IB_MAX_QPS]; int nqps; @@ -362,17 +448,39 @@ static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendC NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1); -ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) { - NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx)); +ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) { + verbs->dev = dev; + + pthread_mutex_lock(&ncclIbDevs[dev].lock); + if (0 == ncclIbDevs[dev].pdRefs++) { + ncclResult_t res; + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure); + if (0) { + failure: + pthread_mutex_unlock(&ncclIbDevs[dev].lock); + return res; + } + } + verbs->pd = ncclIbDevs[dev].pd; + pthread_mutex_unlock(&ncclIbDevs[dev].lock); + // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); return ncclSuccess; } ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) { + ncclResult_t res; NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq)); - NCCLCHECK(wrap_ibv_dealloc_pd(verbs->pd)); - return ncclSuccess; + + pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + if (0 == --ncclIbDevs[verbs->dev].pdRefs) { + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning); + } + res = ncclSuccess; +returning: + pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + return res; } ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) { @@ -398,7 +506,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce return ncclSuccess; } -ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; @@ -407,7 
+515,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { qpAttr.rq_psn = 0; qpAttr.max_dest_rd_atomic = 1; qpAttr.min_rnr_timer = 12; - if (info->lid == 0) { + if (info->link_layer == IBV_LINK_LAYER_ETHERNET) { qpAttr.ah_attr.is_global = 1; qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn; qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid; @@ -426,7 +534,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { return ncclSuccess; } -ncclResult_t ncclIbRtsQp(ibv_qp* qp) { +ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTS; @@ -439,33 +547,56 @@ ncclResult_t ncclIbRtsQp(ibv_qp* qp) { return ncclSuccess; } - ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { struct ncclIbListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large"); + memset(handle, 0, sizeof(struct ncclIbHandle)); comm->dev = dev; - NCCLCHECK(GetSocketAddr(&(handle->connectAddr))); - NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(GetSocketAddr(&comm->sock.addr)); + NCCLCHECK(ncclSocketListen(&comm->sock)); + memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); *listenComm = comm; return ncclSuccess; } ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { - struct ncclIbSendComm* comm; - NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); - struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; - NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); - *sendComm = comm; + enum ncclSocketState conState; + struct ncclIbCommStage* stage = &handle->stage; + struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; + *sendComm = NULL; 
- comm->addr = handle->connectAddr; + if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; + if (stage->state == ncclIbCommStateSend) goto ib_send; + if (stage->state != ncclIbCommStateStart) { + WARN("Error: trying to connect already connected sendComm"); + return ncclInternalError; + } + + NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); + NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1)); + stage->comm = comm; + stage->state = ncclIbCommStateConnect; + NCCLCHECK(ncclSocketConnect(&comm->sock)); + +ib_connect_check: + /* since ncclSocketConnect is async, we must check if connection is complete */ + NCCLCHECK(ncclGetSocketState(&comm->sock, &conState)); + if (conState == ncclSocketConnecting) { + /* expect user to call again */ + return ncclSuccess; + } else if (conState == ncclSocketError) { + return ncclSystemError; + } // IB Setup - ibv_context* ctx = ncclIbDevs[dev].context; - NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs)); - uint8_t ib_port = ncclIbDevs[dev].port; + struct ibv_context* ctx; + ctx = ncclIbDevs[dev].context; + NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs)); + uint8_t ib_port; + ib_port = ncclIbDevs[dev].port; comm->nqps = ncclParamIbQpsPerConn(); for (int q=0; qnqps; q++) { NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q)); @@ -480,13 +611,14 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { qpInfo.mtu = portAttr.active_mtu; // Prepare my fifo - NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); qpInfo.fifoRkey = comm->fifoMr->rkey; qpInfo.fifoAddr = (uint64_t)comm->fifo; // RoCE 
support qpInfo.lid = portAttr.lid; - if (qpInfo.lid) { // IB + qpInfo.link_layer = portAttr.link_layer; + if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q=0; qnqps; q++) INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid); } else { // RoCE @@ -498,7 +630,19 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); } - NCCLCHECK(socketSend(comm->fd, &comm->addr, &qpInfo, sizeof(qpInfo))); + stage->state = ncclIbCommStateSend; + stage->offset = 0; + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo))); + memcpy(stage->buffer, &qpInfo, sizeof(qpInfo)); + +ib_send: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset)); + if (stage->offset != sizeof(qpInfo)) + return ncclSuccess; + + free(stage->buffer); + stage->state = ncclIbCommStateConnected; + *sendComm = comm; return ncclSuccess; } @@ -506,24 +650,53 @@ NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; - struct ncclIbRecvComm* rComm; - NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); + struct ncclIbCommStage* stage = &lComm->stage; + struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; + *recvComm = NULL; + + if (stage->state == ncclIbCommStateAccept) goto ib_accept; + if (stage->state == ncclIbCommStateRecv) goto ib_recv; + if (stage->state == ncclIbCommStateSend) goto ib_send; + if (stage->state != ncclIbCommStateStart) { + WARN("Listencomm in unknown state %d\n", stage->state); + return ncclInternalError; + } + + NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); + stage->comm = rComm; + 
stage->state = ncclIbCommStateAccept; + lComm->sock.asyncFlag = 1; + rComm->sock.asyncFlag = 1; + +ib_accept: + NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); + if (rComm->sock.fd == -1) + return ncclSuccess; - socklen_t socklen = sizeof(union socketAddress); - SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", rComm->fd); struct ncclIbQpInfo remQpInfo; - NCCLCHECK(socketRecv(rComm->fd, &rComm->addr, &remQpInfo, sizeof(remQpInfo))); + stage->state = ncclIbCommStateRecv; + stage->offset = 0; + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo))); +ib_recv: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset)); + if (stage->offset != sizeof(remQpInfo)) + return ncclSuccess; + + /* copy back the received info */ + memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo)); // IB setup - ibv_context* ctx = ncclIbDevs[lComm->dev].context; - uint8_t ib_port = ncclIbDevs[lComm->dev].port; + struct ibv_context* ctx; + uint8_t ib_port; + ctx = ncclIbDevs[lComm->dev].context; + ib_port = ncclIbDevs[lComm->dev].port; struct ibv_port_attr portAttr; NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); union ibv_gid gid; NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid)); // QP Creation - NCCLCHECK(ncclIbInitVerbs(ctx, &rComm->verbs)); + NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs)); rComm->nqps = ncclParamIbQpsPerConn(); for (int q=0; qnqps; q++) { NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q)); @@ -542,8 +715,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { // Retain remote fifo info and prepare my RDMA ops rComm->remFifo.rkey = remQpInfo.fifoRkey; rComm->remFifo.addr = remQpInfo.fifoAddr; - NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, 
IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); - rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo); + NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey; if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; @@ -557,6 +729,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp)); struct ncclIbQpInfo localQpInfo; localQpInfo.lid=portAttr.lid; + localQpInfo.link_layer=portAttr.link_layer; localQpInfo.ib_port=ib_port; localQpInfo.spn=gid.global.subnet_prefix; localQpInfo.iid=gid.global.interface_id; @@ -568,26 +741,39 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { // Fill Handle struct ncclIbQpInfo qpInfo; qpInfo.lid=portAttr.lid; + qpInfo.link_layer=portAttr.link_layer; qpInfo.ib_port=ib_port; for (int q=0; qnqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num; qpInfo.spn=gid.global.subnet_prefix; qpInfo.iid=gid.global.interface_id; qpInfo.mtu=remQpInfo.mtu; - NCCLCHECK(socketSend(rComm->fd, &rComm->addr, &qpInfo, sizeof(qpInfo))); + stage->state = ncclIbCommStateSend; + stage->offset = 0; + if (stage->buffer) free(stage->buffer); + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo))); + memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo)); +ib_send: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset)); + if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess; + + free(stage->buffer); *recvComm = rComm; + + /* reset lComm stage */ + stage->state = ncclIbCommStateStart; + stage->offset = 0; + stage->comm = NULL; + stage->buffer = NULL; return 
ncclSuccess; } ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) { for (int i=0; ireqs+i; - if (r->used == 0) { - r->used = 1; - r->type = 0; + if (r->type == NCCL_NET_IB_REQ_UNUSED) { r->verbs = verbs; r->events = 1; - r->size = -1; r->addr = NULL; *req = r; return ncclSuccess; @@ -598,7 +784,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** return ncclInternalError; } ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) { - r->used = 0; + r->type = NCCL_NET_IB_REQ_UNUSED; return ncclSuccess; } @@ -607,9 +793,9 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) { // Do not block on this receive, return if not ready. int bytes = 0; - NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes)); if (bytes == 0) return ncclSuccess; // Try again later - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes)); + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes)); for (int q=0; qnqps; q++) { struct ibv_qp* qp = comm->qps[q]; @@ -618,7 +804,7 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) { } comm->ready = 1; // Block until this is done. It *should* not block indefinitely. - NCCLCHECK(socketSend(comm->fd, &comm->addr, &comm->ready, sizeof(int))); + NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int))); return ncclSuccess; } @@ -626,39 +812,172 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) { ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) { // Do not block on this receive, return if not ready. 
int bytes = 0; - NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes)); if (bytes == 0) return ncclSuccess; // Try again later - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes)); + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes)); return ncclSuccess; } ncclResult_t ncclIbTest(void* request, int* done, int* size); -#define REG_ALIGN (4096) - ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset"); - struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; - uint64_t addr = (uint64_t)data; assert(size > 0); - // Deregister / register - uint64_t regAddr = addr & (~(REG_ALIGN-1)); - uint64_t regSize = addr+size - regAddr; - regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN; - struct ibv_mr* mr; - NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); - *mhandle = (void*)mr; - TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey); - return ncclSuccess; + static __thread uintptr_t pageSize = 0; + if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE); + + struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; + struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; + uintptr_t addr = (uintptr_t)data & -pageSize; + int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + ncclResult_t res; + pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + for (int slot=0; /*true*/; slot++) { + if (slot == cache->population) { // didn't find in cache + if (cache->population == cache->capacity) { // must grow cache + cache->capacity = 
cache->capacity < 32 ? 32 : 2*cache->capacity; + NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning); + } + // Deregister / register + struct ibv_mr* mr; + unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ; + if (ncclIbRelaxedOrderingEnabled) { + // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support + NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning); + } + else { + NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); + } + TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey); + cache->population += 1; + cache->slots[slot].addr = addr; + cache->slots[slot].pages = pages; + cache->slots[slot].refs = 1; + cache->slots[slot].mr = mr; + *mhandle = (void*)mr; + res = ncclSuccess; + goto returning; + } + else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) { + cache->slots[slot].refs += 1; + *mhandle = (void*)cache->slots[slot].mr; + res = ncclSuccess; + goto returning; + } + } +returning: + pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + return res; } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { - NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle)); + struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; + struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; + ncclResult_t res; + pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + for (int i=0; i < cache->population; i++) { + if (mhandle == cache->slots[i].mr) { + if (0 == --cache->slots[i].refs) { + memmove(&cache->slots[i], &cache->slots[--cache->population], sizeof(struct ncclIbMr)); + if (cache->population == 0) { + free(cache->slots); + cache->slots = NULL; + cache->capacity = 0; + } + NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, 
returning); + } + res = ncclSuccess; + goto returning; + } + } + WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population); + res = ncclInternalError; +returning: + pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + return res; +} + +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { + struct ncclIbRequest** reqs = comm->fifoReqs[slot]; + volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; + int nreqs = slots[0].nreqs; + if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; + + uint64_t wr_id = 0ULL; + + for (int r=0; rwrs+r; + memset(wr, 0, sizeof(struct ibv_send_wr)); + + struct ibv_sge* sge = comm->sges+r; + sge->addr=(uintptr_t)reqs[r]->send.data; + sge->lkey=reqs[r]->send.lkey; + + wr->opcode = IBV_WR_RDMA_WRITE; + wr->send_flags = 0; + wr->wr.rdma.remote_addr = slots[r].addr; + wr->wr.rdma.rkey = slots[r].rkey; + wr->next = wr+1; + wr_id += (reqs[r] - comm->verbs.reqs) << (r*8); + } + + // Write size as immediate data. In the case of multi-send, only write + // 0 or 1 as size to indicate whether there was data sent or received. + uint64_t immData = 0; + if (nreqs == 1) { + immData = reqs[0]->send.size; + } else { + uint8_t* multiImmData = (uint8_t*)&immData; + for (int r=0; rsend.size ? 1 : 0; + } + } + + struct ibv_send_wr* lastWr = comm->wrs+nreqs-1; + if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) { + // When using adaptive routing, send the bulk of the data first as an + // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote + // completion. 
+ lastWr++; + memset(lastWr, 0, sizeof(struct ibv_send_wr)); + } + lastWr->wr_id = wr_id; + lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + lastWr->imm_data = immData; + lastWr->next = NULL; + lastWr->send_flags = IBV_SEND_SIGNALED; + + // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work + const int align = 128; + for (int q=0; qnqps; q++) { + for (int r=0; rsend.size, comm->nqps), align) * align; + int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize); + if (length <= 0) { + comm->wrs[r].sg_list = NULL; + comm->wrs[r].num_sge = 0; + } else { + comm->sges[r].length = length; + comm->wrs[r].sg_list = comm->sges+r; + comm->wrs[r].num_sge = 1; + } + } + struct ibv_send_wr* bad_wr; + NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr)); + + for (int r=0; rsend.size, comm->nqps), align) * align; + reqs[r]->send.offset += chunkSize; + comm->sges[r].addr += chunkSize; + comm->wrs[r].wr.rdma.remote_addr += chunkSize; + } + } + return ncclSuccess; } -ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } @@ -666,108 +985,89 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo struct ibv_mr* mr = (struct ibv_mr*)mhandle; // Wait for the receiver to have posted the corresponding receive - volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS); - volatile uint32_t * readyPtr = &slot->ready; - if (*readyPtr == 0) { *request = NULL; return ncclSuccess; } + int nreqs = 0; + volatile struct ncclIbSendFifo* slots; - struct ncclIbRequest* req; - NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); - req->size = size; - req->addr = 
&comm->addr; + int slot = (comm->fifoHead)%MAX_REQUESTS; + struct ncclIbRequest** reqs = comm->fifoReqs[slot]; + slots = comm->fifo[slot]; + int idx = comm->fifoHead+1; + if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; } + nreqs = slots[0].nreqs; + // Wait until all data has arrived + for (int r=1; rlkey; - -#if USE_RDMA_WRITE == 0 - wr[0].opcode = IBV_WR_SEND; - wr[0].send_flags = IBV_SEND_SIGNALED; -#else - __sync_synchronize(); // order the readyPtr load against rkey load below - // Sanity checks to catch user collective call count/size mismatches - // plus any potential programming errors - if (size > slot->size || slot->size < 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) { - char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/IB : peer %s collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x", - socketToString(req->addr, line), size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead); - return ncclInternalError; - } - wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - wr[0].send_flags = IBV_SEND_SIGNALED; - wr[0].wr.rdma.remote_addr = slot->addr; - wr[0].wr.rdma.rkey = slot->rkey; - wr[0].imm_data = size; // Send the message size via imm_data - __sync_synchronize(); -#endif - // We must clear slot->ready, but reset other fields to aid - // debugging and sanity checks - slot->ready = 0; - slot->addr = 0ULL; - slot->rkey = slot->size = slot->seq = 0; - comm->fifoHead++; - - -#if USE_RDMA_WRITE - // When using adaptive routing, send the bulk of the data first as an - // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote - // completion. 
- if (size > ncclParamIbArThreshold()) { - memset(&wr[1], 0, sizeof(wr[1])); - memcpy(&wr[1], &wr[0], sizeof(wr[0])); - wr[1].sg_list = NULL; - wr[1].num_sge = 0; - wr[0].next = &wr[1]; - - wr[0].opcode = IBV_WR_RDMA_WRITE; - wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - - wr[0].send_flags = 0; - wr[1].send_flags = IBV_SEND_SIGNALED; - } -#endif - - int chunkSize = std::max(8, DIVUP(size, comm->nqps)); - - int offset = 0; - for (int q=0; qnqps; q++) { - int length = std::min(size-offset, chunkSize); - if (length <= 0) { - wr[0].sg_list = NULL; - wr[0].num_sge = 0; - } else { - sge.length = length; - wr[0].sg_list = &sge; - wr[0].num_sge = 1; + // Sanity checks to catch user collective call count/size mismatches + if (size > slots[r].size) { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d", + r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size); + return ncclInvalidUsage; + } // plus any potential programming errors + else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x", + r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), slots[r].size, slots[r].addr, slots[r].rkey); + return ncclInternalError; } - struct ibv_send_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_send(comm->qps[q], wr, &bad_wr)); - offset += chunkSize; - sge.addr += chunkSize; - wr[0].wr.rdma.remote_addr += chunkSize; - } - req->events = comm->nqps; + struct ncclIbRequest* req; + NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); + req->type = NCCL_NET_IB_REQ_SEND; + req->addr = &comm->sock.addr; + req->verbs = &comm->verbs; + req->nreqs = nreqs; + req->send.size = size; + req->send.data = data; + req->send.lkey = mr->lkey; + req->send.offset = 0; + req->addr = &comm->sock.addr; + req->events = comm->nqps; + *request = reqs[r] = req; - 
*request = req; + // If this is a multi-recv, send only when all requests have matched. + for (int r=0; rnreqs, as well as other fields to help debugging and sanity checks + memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); + memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*)); + comm->fifoHead++; + TIME_STOP(0); + return ncclSuccess; + } + + *request = NULL; return ncclSuccess; } -ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) { +ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); - int slot = comm->remFifo.tail%MAX_REQUESTS; - struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot; - localElem->addr = addr; - localElem->rkey = rkey; - localElem->ready = 1; - localElem->size = size; // Sanity/Debugging - localElem->seq = comm->remFifo.tail; // Sanity/Debugging - wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo); + int slot = comm->remFifo.fifoTail%MAX_REQUESTS; + struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot]; + + for (int i=0; irkey; + localElem[i].nreqs = n; + localElem[i].size = sizes[i]; // Sanity/Debugging + localElem[i].tag = tags[i]; + localElem[i].idx = comm->remFifo.fifoTail+1; + } + + wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo); wr.wr.rdma.rkey = comm->remFifo.rkey; comm->remFifo.sge.addr = (uint64_t)localElem; + comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo); wr.sg_list = &comm->remFifo.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_WRITE; @@ -796,92 +1096,107 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t // if (slot == 0) { wr.send_flags |= IBV_SEND_SIGNALED; - wr.wr_id = (uint64_t)req; + wr.wr_id = req - comm->verbs.reqs; req->events++; } struct 
ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr)); - comm->remFifo.tail++; + comm->remFifo.fifoTail++; return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } - - struct ibv_mr* mr = (struct ibv_mr*)mhandle; + if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); - req->size = size; - req->addr = &comm->addr; + req->type = NCCL_NET_IB_REQ_RECV; + req->addr = &comm->sock.addr; + req->nreqs = n; + for (int i=0; irecv.sizes[i] = 0; struct ibv_recv_wr wr; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (uint64_t)req; + wr.wr_id = req - comm->verbs.reqs; wr.sg_list = NULL; wr.num_sge = 0; + TIME_START(1); for (int q=0; qnqps; q++) { struct ibv_qp* qp = comm->qps[q]; struct ibv_recv_wr* bad_wr; NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr)); } + TIME_STOP(1); req->events = comm->nqps; *request = req; // Post to FIFO to notify sender - NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req)); + TIME_START(2); + NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req)); + TIME_STOP(2); return ncclSuccess; } -ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; - if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess; + int last = -1; + for (int i=0; igpuFlush.enabled == 0 || last == -1) return ncclSuccess; + // Only flush once using the last non-zero receive struct 
ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); - req->addr = &comm->addr; - struct ibv_mr* mr = (struct ibv_mr*)mhandle; + req->type = NCCL_NET_IB_REQ_FLUSH; + req->addr = &comm->sock.addr; + struct ibv_mr* mr = (struct ibv_mr*)mhandles[last]; struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (uint64_t)req; + wr.wr_id = req - comm->verbs.reqs; - wr.wr.rdma.remote_addr = (uint64_t)data; + wr.wr.rdma.remote_addr = (uint64_t)data[last]; wr.wr.rdma.rkey = mr->rkey; wr.sg_list = &comm->gpuFlush.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; wr.send_flags = IBV_SEND_SIGNALED; + TIME_START(4); struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr)); + TIME_STOP(4); *request = req; return ncclSuccess; } -ncclResult_t ncclIbTest(void* request, int* done, int* size) { +ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; while (1) { if (r->events == 0) { *done = 1; - if (size) *size = r->size; + if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { + for (int i=0; inreqs; i++) sizes[i] = r->recv.sizes[i]; + } NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } int wrDone = 0; struct ibv_wc wcs[4]; + TIME_START(3); NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone)); + if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); } if (wrDone == 0) return ncclSuccess; for (int w=0; wstatus != IBV_WC_SUCCESS) { char line[SOCKET_NAME_MAXLEN+1]; WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d", - socketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err); + ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err); return ncclSystemError; } - struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id; - if (doneReq) { - if (wc->opcode == IBV_WC_RECV) { - doneReq->size = wc->byte_len; -#if USE_RDMA_WRITE - } else if 
(wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { - doneReq->size = wc->imm_data; -#endif + struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff); + if (req->type == NCCL_NET_IB_REQ_SEND) { + for (int i=0; inreqs; i++) { + struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff); + if ((sendReq->events <= 0)) return ncclInternalError; + sendReq->events--; } - doneReq->events--; + } else { + if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { + if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError; + if (req->nreqs > 1) { + // In the case of a multi recv, we only set sizes to 0 or 1. + uint8_t* sizes = (uint8_t*)&wc->imm_data; + for (int i=0; inreqs; i++) { + req->recv.sizes[i] |= sizes[i]; + } + } else { + req->recv.sizes[0] += wc->imm_data; + } + } + req->events--; } } } @@ -911,20 +1237,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) { ncclResult_t ncclIbCloseSend(void* sendComm) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm) { - close(comm->fd); + close(comm->sock.fd); for (int q=0; qnqps; q++) if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr)); NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } + TIME_PRINT("IB"); return ncclSuccess; } ncclResult_t ncclIbCloseRecv(void* recvComm) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm) { - close(comm->fd); + close(comm->sock.fd); for (int q=0; qnqps; q++) if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); if (comm->gpuFlush.enabled) { @@ -941,7 +1268,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) { ncclResult_t ncclIbCloseListen(void* listenComm) { struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm; if (comm) { - close(comm->fd); + close(comm->sock.fd); free(comm); } return ncclSuccess; @@ -965,3 +1292,4 @@ ncclNet_t ncclNetIb = { ncclIbCloseRecv, ncclIbCloseListen }; + diff 
--git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index c045a8f91d..a8f69aa5f7 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -19,7 +19,7 @@ /* Init functions */ static int ncclNetIfs = -1; struct ncclSocketDev { - union socketAddress addr; + union ncclSocketAddress addr; char devName[MAX_IF_NAME_SIZE]; char* pciPath; }; @@ -40,8 +40,8 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { pthread_mutex_lock(&ncclSocketLock); if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; - union socketAddress addrs[MAX_IFS]; - ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); + union ncclSocketAddress addrs[MAX_IFS]; + ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); return ncclInternalError; @@ -53,10 +53,10 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { addrline[SOCKET_NAME_MAXLEN] = '\0'; for (int i=0; iguid = dev; props->ptrSupport = NCCL_PTR_HOST; NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed)); + props->latency = 0; // Not set props->port = 0; props->maxComms = 65536; + props->maxRecvs = 1; return ncclSuccess; } -ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { +ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) { if (dev >= ncclNetIfs) return ncclInternalError; memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr)); return ncclSuccess; @@ -118,18 +120,33 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); 
NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); +enum ncclSocketCommState { + ncclSocketCommStateStart = 0, + ncclSocketCommStateConnect = 1, + ncclSocketCommStateAccept = 3, + ncclSocketCommStateSend = 4, + ncclSocketCommStateRecv = 5, +}; + +struct ncclSocketCommStage { + enum ncclSocketCommState state; + uint8_t iteration; + struct ncclSocket* sock; + struct ncclSocketComm* comm; +}; + struct ncclSocketHandle { - union socketAddress connectAddr; + union ncclSocketAddress connectAddr; int nSocks; int nThreads; + struct ncclSocketCommStage stage; }; struct ncclSocketTask { int op; void* data; int size; - int fd; - union socketAddress *addr; + struct ncclSocket* sock; int offset; int used; ncclResult_t result; @@ -139,8 +156,7 @@ struct ncclSocketRequest { int op; void* data; int size; - int ctrlFd; - union socketAddress *addr; + struct ncclSocket* ctrlSock; int offset; int used; struct ncclSocketComm* comm; @@ -154,29 +170,30 @@ struct ncclSocketTaskQueue { struct ncclSocketTask* tasks; }; -enum threadState {start, stop}; - struct ncclSocketThreadResources { struct ncclSocketTaskQueue threadTaskQueue; - enum threadState state; + int stop; struct ncclSocketComm* comm; pthread_mutex_t threadLock; pthread_cond_t threadCond; }; struct ncclSocketListenComm { - int fd; + struct ncclSocket sock; + struct ncclSocketCommStage stage; int nSocks; int nThreads; + int dev; }; struct ncclSocketComm { - int ctrlFd; - union socketAddress addr; - int fds[MAX_SOCKETS]; + struct ncclSocket ctrlSock; + struct ncclSocket socks[MAX_SOCKETS]; + int dev; + int cudaDev; int nSocks; int nThreads; - int nextFd; + int nextSock; struct ncclSocketRequest requests[MAX_REQUESTS]; pthread_t helperThread[MAX_THREADS]; struct ncclSocketThreadResources threadResources[MAX_THREADS]; @@ -185,7 +202,6 @@ struct ncclSocketComm { void* persistentSocketThread(void *args_) { struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_; struct ncclSocketComm* comm = 
resource->comm; - volatile enum threadState* state = &resource->state; struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue; int nSocksPerThread = comm->nSocks / comm->nThreads; while (1) { @@ -198,7 +214,7 @@ void* persistentSocketThread(void *args_) { for (int j=0; jtasks+i+j; if (r != NULL && r->used == 1 && r->offset < r->size) { - r->result = socketProgress(r->op, r->fd, r->addr, r->data, r->size, &r->offset); + r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { WARN("NET/Socket : socket progress error"); return NULL; @@ -211,12 +227,12 @@ void* persistentSocketThread(void *args_) { } if (idle) { pthread_mutex_lock(&resource->threadLock); - while (mark == myQueue->next && *state != stop) { // no new tasks, wait + while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait pthread_cond_wait(&resource->threadCond, &resource->threadLock); } pthread_mutex_unlock(&resource->threadLock); } - if (*state == stop) return NULL; + if (resource->stop) return NULL; } } @@ -271,17 +287,17 @@ end: ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) { NCCLCHECK(ncclCalloc(comm, 1)); - (*comm)->fd = -1; + (*comm)->sock.fd = -1; return ncclSuccess; } ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { NCCLCHECK(ncclCalloc(comm, 1)); - (*comm)->ctrlFd = -1; + (*comm)->ctrlSock.fd = -1; for (int i=0; i < MAX_SOCKETS; i++) { - (*comm)->fds[i] = -1; + (*comm)->socks[i].fd = -1; } - (*comm)->nextFd = 0; + (*comm)->nextSock = 0; return ncclSuccess; } @@ -290,14 +306,18 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { return ncclInternalError; } struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); + memset(handle, 0, sizeof(struct ncclSocketHandle)); + static_assert(sizeof(struct ncclSocketHandle) <= 
NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); struct ncclSocketListenComm* comm; NCCLCHECK(ncclSocketNewListenComm(&comm)); - NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr)); - NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr)); + NCCLCHECK(ncclSocketListen(&comm->sock)); + memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); handle->nSocks = comm->nSocks; handle->nThreads = comm->nThreads; + comm->sock.asyncFlag = 1; + comm->dev = dev; *listenComm = comm; return ncclSuccess; } @@ -306,38 +326,99 @@ ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) { if (dev < 0) { // data transfer socket is based on specified dev return ncclInternalError; } - struct ncclSocketComm* comm; - NCCLCHECK(ncclSocketNewComm(&comm)); + + enum ncclSocketState conState; struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; + struct ncclSocketCommStage* stage = &handle->stage; + struct ncclSocketComm* comm = stage->comm; + uint8_t i = stage->iteration; + struct ncclSocket* sock = stage->sock; + *sendComm = NULL; + + if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check; + if (stage->state == ncclSocketCommStateSend) goto socket_send; + + NCCLCHECK(ncclSocketNewComm(&comm)); + stage->comm = comm; comm->nSocks = handle->nSocks; comm->nThreads = handle->nThreads; - for (int i=0; inSocks+1; i++) { - int tmpFd, offset=0; - NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr)); - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &handle->connectAddr, &i, sizeof(int), &offset)); - if (i == comm->nSocks) comm->ctrlFd = tmpFd; - else comm->fds[i] = tmpFd; + comm->dev = dev; + CUDACHECK(hipGetDevice(&comm->cudaDev)); + for (; inSocks+1; i++) { + sock = i == comm->nSocks ? 
&comm->ctrlSock : comm->socks+i; + NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1)); + + stage->sock = sock; + stage->state = ncclSocketCommStateConnect; + stage->iteration = i; + NCCLCHECK(ncclSocketConnect(sock)); + +socket_connect_check: + NCCLCHECK(ncclGetSocketState(sock, &conState)); + if (conState == ncclSocketConnecting) { + /* expect user to call again */ + return ncclSuccess; + } else if (conState == ncclSocketError) { + return ncclSystemError; + } + stage->state = ncclSocketCommStateSend; + +socket_send: + int done = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done)); + if (done == 0) return ncclSuccess; } *sendComm = comm; - comm->addr = handle->connectAddr; return ncclSuccess; } ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm; - struct ncclSocketComm* rComm; + struct ncclSocketCommStage* stage = &lComm->stage; + struct ncclSocketComm* rComm = stage->comm; + uint8_t i = stage->iteration; + struct ncclSocket* sock = stage->sock; + + *recvComm = NULL; + if (stage->state == ncclSocketCommStateAccept) goto socket_accept; + if (stage->state == ncclSocketCommStateRecv) goto socket_recv; + NCCLCHECK(ncclSocketNewComm(&rComm)); + stage->comm = rComm; rComm->nSocks = lComm->nSocks; rComm->nThreads = lComm->nThreads; - for (int i=0; inSocks+1; i++) { - int tmpFd, sendSockIdx, offset=0; - socklen_t socklen = sizeof(union socketAddress); - SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", tmpFd); - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &rComm->addr, &sendSockIdx, sizeof(int), &offset)); - if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd; - else rComm->fds[sendSockIdx] = tmpFd; + rComm->dev = lComm->dev; + CUDACHECK(hipGetDevice(&rComm->cudaDev)); + lComm->sock.asyncFlag = 1; + for (; inSocks+1; i++) { + uint8_t sendSockIdx; + ncclCalloc(&sock, 1); + NCCLCHECK(ncclSocketInit(sock, NULL, 
NULL, 1)); + stage->sock = sock; + stage->state = ncclSocketCommStateAccept; + stage->iteration = i; +socket_accept: + NCCLCHECK(ncclSocketAccept(sock, &lComm->sock)); + if (sock->fd == -1) return ncclSuccess; + + stage->state = ncclSocketCommStateRecv; +socket_recv: + int done = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done)); + if (done == 0) return ncclSuccess; + + if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket)); + else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); + + free(sock); } *recvComm = rComm; + + /* reset lComm state */ + stage->state = ncclSocketCommStateStart; + stage->iteration = 0; + stage->sock = NULL; + stage->comm = NULL; return ncclSuccess; } @@ -348,8 +429,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat r->op = op; r->data = data; r->size = size; - r->ctrlFd = comm->ctrlFd; - r->addr = &comm->addr; + r->ctrlSock = &comm->ctrlSock; r->used = 1; r->comm = comm; r->nSubs = 0; @@ -362,7 +442,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat } ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) { - int tid = comm->nextFd % comm->nThreads; + int tid = comm->nextSock % comm->nThreads; struct ncclSocketThreadResources* res = comm->threadResources+tid; struct ncclSocketTaskQueue* queue = &res->threadTaskQueue; // create helper threads and prepare per-thread task queue @@ -377,22 +457,21 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, pthread_mutex_init(&res->threadLock, NULL); pthread_cond_init(&res->threadCond, NULL); pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); + ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 
'S' : 'R', comm->dev, tid, comm->cudaDev); } struct ncclSocketTask* r = queue->tasks+queue->next; if (r->used == 0) { r->op = op; r->data = data; r->size = size; - r->fd = comm->fds[comm->nextFd]; - r->addr = &comm->addr; + r->sock = comm->socks+comm->nextSock; r->offset = 0; r->result = ncclSuccess; - comm->nextFd = (comm->nextFd + 1) % comm->nSocks; + comm->nextSock = (comm->nextSock + 1) % comm->nSocks; r->used = 1; *req = r; pthread_mutex_lock(&res->threadLock); queue->next = (queue->next+1)%queue->len; - res->state = start; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); return ncclSuccess; @@ -411,18 +490,20 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { if (r->used == 1) { /* try to send/recv size */ int data = r->size; int offset = 0; - NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset)); + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset)); if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ // Not sure we could ever receive less than 4 bytes, but just in case ... - if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset)); + if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset)); // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size); - return ncclInternalError; + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ + there may be a mismatch in collective sizes or environment settings (e.g. 
NCCL_PROTO, NCCL_ALGO) between ranks", + ncclSocketToString(&r->ctrlSock->addr, line), data, r->size); + return ncclInvalidUsage; } r->size = data; r->offset = 0; @@ -459,7 +540,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { } } else { // progress request using main thread if (r->offset < r->size) { - NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, r->data, r->size, &r->offset)); + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset)); } if (r->offset == r->size) { if (size) *size = r->size; @@ -476,19 +557,20 @@ ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** } ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } -ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; - NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request)); + if (n != 1) return ncclInternalError; + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { // We don't support CUDA pointers, so we don't need a flush 
operation return ncclInternalError; } @@ -496,7 +578,7 @@ ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandl ncclResult_t ncclSocketCloseListen(void* opaqueComm) { struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm; if (comm) { - if (comm->fd != -1) close(comm->fd); + if (comm->sock.fd != -1) close(comm->sock.fd); free(comm); } return ncclSuccess; @@ -509,16 +591,16 @@ ncclResult_t ncclSocketClose(void* opaqueComm) { struct ncclSocketThreadResources* res = comm->threadResources+i; if (comm->helperThread[i]) { pthread_mutex_lock(&res->threadLock); - res->state = stop; + res->stop = 1; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); pthread_join(comm->helperThread[i], NULL); } free(res->threadTaskQueue.tasks); } - if (comm->ctrlFd != -1) close(comm->ctrlFd); + if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd); for (int i=0; inSocks; i++) { - if (comm->fds[i] != -1) close(comm->fds[i]); + if (comm->socks[i].fd != -1) close(comm->socks[i].fd); } free(comm); } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index d98e18c8bc..c6513c5c1d 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,32 +8,30 @@ #include "comm.h" #include "graph.h" #include "utils.h" -#include "bootstrap.h" -struct p2pConnectInfo { - int rank; - int read; +struct ncclP2pBuff { void* directPtr; hipIpcMemHandle_t devIpc; }; +struct p2pConnectInfo { + int rank; + int read; + struct ncclP2pBuff p2pBuff; +}; +static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large"); + struct p2pSendResources { struct ncclSendMem* devMem; - void* ipcPtr; uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only) - int remoteId; - int memRank; - void* remIpcPtr; - void* bootstrap; + void* sendMemIpc; + void* recvMemIpc; }; struct p2pRecvResources { struct ncclRecvMem* devMem; - void* ipcPtr; - int remoteId; - int memRank; - void* remIpcPtr; - void* bootstrap; + void* sendMemIpc; + void* recvMemIpc; }; #include @@ -103,15 +101,22 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop #else // Check that legacy IPC support is available if (p2p != 0) { + // Cached result of the legacyIPC detection + static int legacyIPC = -1; + if (legacyIPC >= 0) { + *ret = legacyIPC; + return ncclSuccess; + } + // Check that legacy IPC support is available (WSL WAR) char *dummy; - cudaIpcMemHandle_t ipc; + hipIpcMemHandle_t ipc; NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN)); - if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) { - INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported on dev %d(=%lx)", - cudaDev1, info1->busId); + if (hipIpcGetMemHandle(&ipc, dummy) != hipSuccess) { + INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported"); *ret = 0; } - CUDACHECK(cudaFree(dummy)); + CUDACHECK(hipFree(dummy)); + legacyIPC = *ret; return ncclSuccess; } #endif @@ -132,6 +137,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], 
devIpc[6], devIpc[7]); \ } while (0) + // Setting this to non zero causes P2P to use Reads rather than Writes NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2); @@ -146,7 +152,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* return ncclSuccess; } -static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) { +static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { if (myInfo->pidHash == peerInfo->pidHash) { if (peerInfo->cudaDev != myInfo->cudaDev) { // Enable P2P access @@ -159,10 +165,10 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee return ncclInternalError; } } - *devMem = p2pInfo->directPtr; + *devMem = p2pBuff->directPtr; *ipcPtr = NULL; } else { - CUDACHECK(hipIpcOpenMemHandle(devMem, p2pInfo->devIpc, hipIpcMemLazyEnablePeerAccess)); + CUDACHECK(hipIpcOpenMemHandle(devMem, p2pBuff->devIpc, hipIpcMemLazyEnablePeerAccess)); *ipcPtr = *devMem; } return ncclSuccess; @@ -188,44 +194,40 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg); } - struct p2pConnectInfo info; - // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) - info.read = (connIndex == 0) ? useRead : 0; - const char* useReadStr = info.read ? "/read" : ""; + static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); + struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; + info->read = useRead; + // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) + if (graph && connIndex == 1) info->read = 0; + const char* useReadStr = info->read ? 
"/read" : ""; int sendSize = sizeof(struct ncclSendMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - if (info.read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE]; + if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE]; ALIGN_SIZE(sendSize, CUDA_IPC_MIN); - resources->remoteId = -1; - resources->bootstrap = comm->bootstrap; if (intermediateRank == -1) { - NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize, true)); - info.rank = myInfo->rank; + info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - send->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { - send->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr)); + send->conn.direct |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } } else { - NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr)); - info.rank = intermediateRank; - INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d", + info->rank = intermediateRank; + INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, - comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks); + comm->peerInfo[intermediateRank].busId, useReadStr); } - resources->memRank = info.rank; - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); - memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); + NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); return ncclSuccess; } @@ -238,36 +240,32 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); - struct p2pConnectInfo info; - // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) - info.read = (connIndex == 0) ? 
useRead : 0; + static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); + struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; + info->read = useRead; + // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) + if (graph && connIndex == 1) info->read = 0; - int recvSize = offsetof(struct ncclRecvMem, buff); + int recvSize = sizeof(struct ncclRecvMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - for (int p=0; pcomm->buffSizes[p]; + for (int p=0; pread && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p]; ALIGN_SIZE(recvSize, CUDA_IPC_MIN); - resources->remoteId = -1; - resources->bootstrap = comm->bootstrap; if (intermediateRank == -1) { - NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize, true)); - info.rank = myInfo->rank; + info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - recv->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { - recv->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr)); + recv->conn.direct |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; } } else { - NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr)); - info.rank = intermediateRank; + info->rank = intermediateRank; } - resources->memRank = info.rank; - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn)); + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); - memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); + NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc)); return ncclSuccess; } @@ -277,16 +275,16 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co struct ncclRecvMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr)); + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); - int offset = 0; + char* buff = (char*)(remDevMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */ - send->conn.buffs[p] = resources->devMem->buff; + send->conn.buffs[p] = (char*)(resources->devMem+1); } else { - send->conn.buffs[p] = remDevMem->buff + offset; - offset += send->comm->buffSizes[p]; + send->conn.buffs[p] = buff; + buff += send->comm->buffSizes[p]; } } send->conn.tail = &remDevMem->tail; @@ -303,16 +301,16 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn struct ncclSendMem* remDevMem; struct p2pConnectInfo* 
info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr)); + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); - int offset = 0; + char* buff = (char*)(resources->devMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */ - recv->conn.buffs[p] = remDevMem->buff; + recv->conn.buffs[p] = (char*)(remDevMem+1); } else { - recv->conn.buffs[p] = resources->devMem->buff + offset; - offset += recv->comm->buffSizes[p]; + recv->conn.buffs[p] = buff; + buff += recv->comm->buffSizes[p]; } } recv->conn.tail = &resources->devMem->tail; @@ -322,39 +320,49 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn return ncclSuccess; } -ncclResult_t p2pSendFree(void* resources) { - struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; - if (sendRes->ipcPtr) - CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr)); - if (sendRes->remIpcPtr) - CUDACHECK(hipIpcCloseMemHandle(sendRes->remIpcPtr)); - if (sendRes->remoteId != -1) { - NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap)); - sendRes->devMem = NULL; - } - CUDACHECK(hipFree(sendRes->devMem)); - free(sendRes); +ncclResult_t p2pSendFree(struct ncclConnector* send) { + struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; + if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc)); + free(resources); return ncclSuccess; } -ncclResult_t p2pRecvFree(void* resources) { - struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; - if (recvRes->ipcPtr) - CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr)); - if (recvRes->remIpcPtr) - 
CUDACHECK(hipIpcCloseMemHandle(recvRes->remIpcPtr)); - if (recvRes->remoteId != -1) { - NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap)); - recvRes->devMem = NULL; +ncclResult_t p2pRecvFree(struct ncclConnector* recv) { + struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; + if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc)); + free(resources); + return ncclSuccess; +} + +static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(int)) return ncclInternalError; + int size = *((int*)reqBuff); + if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; + struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; + NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, true)); + connection->transportResources = p2pBuff->directPtr; + hipError_t res = hipIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr); + if (res != hipSuccess) { + WARN("hipIpcGetMemHandle failed : %s", hipGetErrorString(res)); + hipFree(p2pBuff->directPtr); + free(p2pBuff); + CUDACHECK(res); } - CUDACHECK(hipFree(recvRes->devMem)); - free(recvRes); + *done = 1; + return ncclSuccess; +} + +static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + // Do not check return code as CUDA may have already shut down + hipFree(connection->transportResources); return ncclSuccess; } struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, - { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }, + { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL 
} }; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index af20188981..974a2ab621 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -1,6 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,12 +8,10 @@ #include "shm.h" struct shmConnectInfo { - uint64_t pidHash; - int id; - int sendRank; - int recvRank; + char shmName[7]; int shmSize; }; +static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large"); struct shmSendResources { int remShmSize; @@ -63,22 +60,17 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; - struct shmConnectInfo info; - info.id = channelId; - info.pidHash = myInfo->pidHash; - info.sendRank = myInfo->rank; - info.recvRank = peerInfo->rank; + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); - info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); - NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + char shmPath[PATH_MAX]; + shmPath[0] = '\0'; + info->shmSize = resources->shmSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, 
(void**)&resources->devHostMem, 1)); + TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); + memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); - INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d", channelId, myInfo->rank, - myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks); - static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); + INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); return ncclSuccess; } @@ -87,22 +79,18 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; - struct shmConnectInfo info; - info.id = channelId; - info.pidHash = myInfo->pidHash; - info.sendRank = peerInfo->rank; - info.recvRank = myInfo->rank; + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); - int shmSize = offsetof(struct ncclRecvMem, buff); + char shmPath[PATH_MAX]; + shmPath[0] = '\0'; + int shmSize = sizeof(struct ncclRecvMem); for (int p=0; pcomm->buffSizes[p]; - info.shmSize = resources->shmSize = shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); - NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + info->shmSize = resources->shmSize = shmSize; + NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + 
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); + memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); - static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } @@ -112,18 +100,18 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); + char shmPath[PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); - NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); + NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); // Remove the file to ensure proper clean-up - NCCLCHECK(shmUnlink(shmName)); + NCCLCHECK(ncclShmUnlink(shmPath)); send->transportResources = resources; int offset = 0; for (int p=0; pconn.buffs[p] = resources->devRemHostMem->buff + offset; + send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset; offset += send->comm->buffSizes[p]; } send->conn.tail = &resources->devRemHostMem->tail; @@ -137,35 +125,35 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, 
"nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); + char shmPath[PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); - NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); - NCCLCHECK(shmUnlink(shmName)); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); + NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); + NCCLCHECK(ncclShmUnlink(shmPath)); recv->conn.head = &resources->devRemHostMem->head; int offset = 0; for (int p=0; pconn.buffs[p] = resources->devHostMem->buff + offset; + recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset; offset += recv->comm->buffSizes[p]; } recv->conn.tail = &resources->devHostMem->tail; return ncclSuccess; } -ncclResult_t shmSendFree(void* transportResources) { - struct shmSendResources* resources = (struct shmSendResources*)transportResources; - NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); +ncclResult_t shmSendFree(struct ncclConnector* send) { + struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); free(resources); return ncclSuccess; } -ncclResult_t shmRecvFree(void* transportResources) { - struct shmRecvResources* resources = (struct shmRecvResources*)transportResources; - NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); 
+ncclResult_t shmRecvFree(struct ncclConnector* recv) { + struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); free(resources); return ncclSuccess; } @@ -173,6 +161,6 @@ ncclResult_t shmRecvFree(void* transportResources) { struct ncclTransport shmTransport = { "SHM", shmCanConnect, - { shmSendSetup, shmSendConnect, shmSendFree, NULL }, - { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL } + { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL }, + { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL } }; diff --git a/tools/topo_expl/include/utils.h b/tools/topo_expl/include/utils.h index d249dd3697..068d669595 100644 --- a/tools/topo_expl/include/utils.h +++ b/tools/topo_expl/include/utils.h @@ -8,12 +8,6 @@ #ifndef UTILS_H_ #define UTILS_H_ -struct allGather1Data_t { - struct ncclPeerInfo peerInfo; - struct ncclComm* comm; - int cudaCompCap; -}; - // AllGather3 - begin struct ncclGraphInfo { int pattern; @@ -26,6 +20,7 @@ struct ncclGraphInfo { }; struct allGather3Data_t{ + int netDev; int collNetSupport; int nc; struct ncclGraphInfo tree; @@ -40,9 +35,9 @@ ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem); -ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data); +ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash); -ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data, +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct 
ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph); ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, diff --git a/tools/topo_expl/model.cpp b/tools/topo_expl/model.cpp index 3285bb68f4..969ffbc6c3 100644 --- a/tools/topo_expl/model.cpp +++ b/tools/topo_expl/model.cpp @@ -66,6 +66,10 @@ ncclNet_t ncclNetDummy = { ncclNet_t* ncclNet = &ncclNetDummy; +int ncclNetVersion() { + return 4; +} + /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ int busIdToCudaDev(int64_t busId) { return node_model->busIdToCudaDev(busId); @@ -142,6 +146,19 @@ struct ncclTransport shmTransport = { { shmRecvSetup, NULL, NULL, NULL } }; +NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); + +struct setupReq { + int rank; + int localRank; + int remoteRank; + int shared; + int netDev; + int useGdr; + int channelId; + int connIndex; +}; + /* Determine if two peers can communicate with NET */ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = node_model->netCanConnect(info1->rank, info2->rank); @@ -149,38 +166,47 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop } ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - int netDev, useGdr = 0; + struct setupReq req; - netDev = -1; - if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &netDev)); - if (netDev < 0) { - // Send/Receive: Round-robin NICs based on the receiver's CUDA device - int nicRR = comm->peerInfo[peerInfo->rank].cudaDev; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev)); + send->conn.shared = req.shared = 
graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; + req.channelId = channelId; + req.connIndex = connIndex; + req.netDev = -1; + + int proxyRank = myInfo->rank; + if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &req.netDev)); + if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); + + if (proxyRank == myInfo->rank) { + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + } else { + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr)); - - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev, - useGdr ? 
"/GDRDMA" : ""); + *((int*)connectInfo) = proxyRank; return ncclSuccess; } NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB); ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - int netDev, useGdr = 0; + struct setupReq req; - netDev = -1; - if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev)); - if (netDev < 0) { - // Send/Receive: Round-robin NICs based on the receiver's CUDA device - int nicRR = comm->cudaDev; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev)); - } - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr)); + recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; + req.channelId = channelId; + req.connIndex = connIndex; + req.netDev = -1; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev, - useGdr ? "/GDRDMA" : ""); + // Use myInfo->rank as the receiver uses its own NIC + int proxyRank = myInfo->rank; + if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &req.netDev)); + if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); + + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); return ncclSuccess; } @@ -198,9 +224,9 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc } ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - int netDev, useGdr = 0; + int netDev, useGdr = 0, proxy; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev)); + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netDev, &proxy)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr)); INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? "/GDRDMA" : ""); @@ -208,9 +234,9 @@ ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph } ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - int netDev, useGdr = 0; + int netDev, useGdr = 0, proxy; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev)); + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netDev, &proxy)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr)); INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? 
"/GDRDMA" : ""); diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp index da70b9c021..e4fe37e251 100644 --- a/tools/topo_expl/topo_expl.cpp +++ b/tools/topo_expl/topo_expl.cpp @@ -195,12 +195,17 @@ int main(int argc,char* argv[]) NCCLCHECK(ncclCalloc(&comm, nranks)); - struct allGather1Data_t *allGather1Data; - NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); + struct ncclPeerInfo *peerInfo; + NCCLCHECK(ncclCalloc(&peerInfo, nranks+1)); // Extra rank to represent CollNet root struct allGather3Data_t *allGather3Data; NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph; + NCCLCHECK(ncclCalloc(&treeGraph, nranks)); + NCCLCHECK(ncclCalloc(&ringGraph, nranks)); + NCCLCHECK(ncclCalloc(&collNetGraph, nranks)); + for (int i = 0; i < nranks; i++) { comm[i].rank = i; comm[i].nRanks = nranks; @@ -211,22 +216,18 @@ int main(int argc,char* argv[]) NCCLCHECK(ncclCalloc(&comm[i].p2pRecvs, comm->nRanks)); node_model = network.GetNode(i); assert(node_model!=0); + comm[i].busId = node_model->getGpuBusId(i); comm[i].topo = node_model->getSystem(i); - bootstrapAllGather(&comm[i], allGather1Data); + comm[i].peerInfo = peerInfo; // Mark channels as non initialized. 
for (int c=0; cnRanks)); + NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0)); } - struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph; - NCCLCHECK(ncclCalloc(&treeGraph, nranks)); - NCCLCHECK(ncclCalloc(&ringGraph, nranks)); - NCCLCHECK(ncclCalloc(&collNetGraph, nranks)); for (int i = 0; i < nranks; i++) { node_model = network.GetNode(i); assert(node_model!=0); - initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]); + initTransportsRank_1(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]); } for (int i = 0; i < nranks; i++) { @@ -246,7 +247,7 @@ int main(int argc,char* argv[]) free(ringGraph); free(collNetGraph); free(allGather3Data); - free(allGather1Data); + free(peerInfo); free(comm); printf("Done generating topology using %d: %s\n", model_id, desc->description); diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 05285892d0..42f93cad02 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -37,10 +37,8 @@ const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; extern NodeModel *node_model; -NCCL_PARAM(CrossNic, "CROSS_NIC", 2); NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); -RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0); NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); thread_local int ncclDebugNoWarn = 0; @@ -111,11 +109,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file vsprintf(buffer+len, fmt, args); va_end(args); printf("%s\n", buffer); +#if 0 if (level == NCCL_LOG_WARN) { fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n", node_model->nodeId, node_model->currRank, filefunc, line); abort(); } +#endif } ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** system) { @@ -128,20 +128,6 @@ ncclResult_t ncclTopoGetSystem(const char* xmlTopoFile, struct ncclTopoSystem** } 
-ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) { - // AllGather1 - begin - allGather1Data[comm->rank].comm = comm; - allGather1Data[comm->rank].cudaCompCap = 1; - allGather1Data[comm->rank].peerInfo.rank = comm->rank; - allGather1Data[comm->rank].peerInfo.cudaDev = node_model->rankToCudaDev(comm->rank); - allGather1Data[comm->rank].peerInfo.gdrSupport = 1; - allGather1Data[comm->rank].peerInfo.hostHash = node_model->hostHash; - allGather1Data[comm->rank].peerInfo.pidHash = node_model->pidHash; - allGather1Data[comm->rank].peerInfo.shmDev = 0x19; - allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(comm->rank); - return ncclSuccess; -} - void initCollNet() { if (ncclParamCollNetEnable() == 1 && ncclCollNet == 0) ncclCollNet = (ncclCollNet_t*)0x12345678; @@ -182,6 +168,30 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) { return ncclSuccess; } +ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { + info->rank = comm->rank; + info->cudaDev = node_model->rankToCudaDev(comm->rank); + info->hostHash = node_model->hostHash; + info->pidHash = node_model->pidHash; + + // Get the device MAJOR:MINOR of /dev/shm so we can use that + // information to decide whether we can use SHM for inter-process + // communication in a container environment + //struct stat statbuf; + //SYSCHECK(stat("/dev/shm", &statbuf), "stat"); + info->shmDev = 0x19; + + info->busId = node_model->getGpuBusId(comm->rank); + + // detect if fine grained memory is available on this GPU + info->hasFineGrain = true; + info->gdrSupport = 1; + + info->comm = comm; + info->cudaCompCap = 1; + return ncclSuccess; +} + static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); NCCLCHECK(initChannel(comm, channelId)); @@ -230,8 +240,8 @@ static ncclResult_t selectTransport(struct ncclComm* 
comm, struct ncclTopoGraph* return ncclSuccess; } } - WARN("No transport found !"); - return ncclInternalError; + WARN("No transport found for rank %d[%lx] -> rank %d[%lx]", myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + return ncclSystemError; } ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { @@ -250,12 +260,19 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* return ncclSuccess; } +void dumpData(struct ncclConnect* data, int ndata) { + for (int n=0; n= 11030 // Stream used during transport setup; need for P2P pre-connect + CUDA Graph - hipStream_t transportSetupStream; - CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking)); -#endif + //hipStream_t transportSetupStream; + //CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking)); int highestType = TRANSPORT_P2P; // track highest transport type struct ncclConnect data[2*MAXCHANNELS]; @@ -302,11 +319,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex; //NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn)); conn->connected = 1; -#if CUDART_VERSION >= 11030 //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); -#else - //CUDACHECK(hipMemcpy(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); -#endif } } for (int c=0; cchannels[c].peers[recvPeer].recv + connIndex; //NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn)); conn->connected = 1; -#if CUDART_VERSION >= 11030 //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct 
ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); -#else - //CUDACHECK(hipMemcpy(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); -#endif } } comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0; } -#if CUDART_VERSION >= 11030 - CUDACHECK(hipStreamSynchronize(transportSetupStream)); - CUDACHECK(hipStreamDestroy(transportSetupStream)); -#endif + //CUDACHECK(hipStreamSynchronize(transportSetupStream)); + //CUDACHECK(hipStreamDestroy(transportSetupStream)); if (highestTransportType != NULL) *highestTransportType = highestType; return ncclSuccess; } @@ -422,9 +429,9 @@ cleanup: ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { // AllGather collNet setup results - int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0}; - allGatherFailures[comm->intraNodeRank] = collNetSetupFail; - //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int))); + int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0}; + allGatherFailures[comm->localRank] = collNetSetupFail; + //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int))); for (int i=0; ilocalRanks; i++) { if (allGatherFailures[i] != 0) { collNetSetupFail = 1; @@ -432,7 +439,7 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa } } if (collNetSetupFail) { - if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); + if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); return ncclSystemError; } return ncclSuccess; @@ -457,80 +464,33 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t initTransportsRank_1(struct ncclComm* 
comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data, +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { + // We use 2 AllGathers + // 1. { peerInfo, comm, compCap} + // 2. { nChannels, graphInfo, topoRanks } + int rank = comm->rank; int nranks = comm->nRanks; //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); - //NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); + // [RCCL] Collect the PID of the root + int rootPid; + //NCCLCHECK(bootstrapInit(commId, comm)); + // [/RCCL] // AllGather1 - begin - //struct { - // struct ncclPeerInfo peerInfo; - // struct ncclComm* comm; - // int cudaCompCap; - //} *allGather1Data; + //NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root + //NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, comm->rank)); + //NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo))); - //NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); - //allGather1Data[rank].comm = comm; - //allGather1Data[rank].cudaCompCap = ncclCudaCompCap(); - struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo; - //NCCLCHECK(fillInfo(comm, myInfo, commHash)); - //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); - - NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root for (int i = 0; i < nranks; i++) { - memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); - if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { - WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId); + if 
((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); return ncclInvalidUsage; } } - // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs - int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; - int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0; - int myCompCap = allGather1Data[rank].cudaCompCap; - int minCompCap = myCompCap, maxCompCap = myCompCap; - for (int i = 0; i < nranks; i++) { - if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) { - // Rank is on same node - if (intraNodeRanks == 0) intraNodeRank0 = i; - if (i == rank) intraNodeRank = intraNodeRanks; - comm->intraNodeGlobalRanks[intraNodeRanks] = i; - comm->rankToIntraNodeRank[i] = intraNodeRanks; - intraNodeRanks++; - if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) { - // Rank is in same process - if (intraProcRanks == 0) intraProcRank0 = i; - if (i == rank) intraProcRank = intraProcRanks; - intraProcRanks++; - } - } - minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap); - maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap); - } - TRACE(NCCL_INIT,"hostHash[%d] %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, intraNodeRank, intraNodeRanks, intraNodeRank0); - TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", - rank, allGather1Data[rank].peerInfo.pidHash, intraProcRank, intraProcRanks, intraProcRank0); - if (intraProcRank == -1 || intraProcRank0 == -1 || allGather1Data[intraProcRank0].comm == NULL) { - WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", - rank, 
allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash, - intraProcRank, intraProcRanks, intraProcRank0); - return ncclInternalError; - } - if (intraNodeRank == -1 || intraNodeRank0 == -1 || intraNodeRanks == 0) { - WARN("Failed to determine intra node ranks rank %d hostHash %lx pidHash %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash, - intraNodeRank, intraNodeRanks, intraNodeRank0); - return ncclInternalError; - } - struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm; - uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash; - comm->intraNodeRank = intraNodeRank; - // AllGather1 - end // Topo detection / System graph creation @@ -550,11 +510,23 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t // Print final topology NCCLCHECK(ncclTopoPrint(comm->topo)); + // Set Affinity to a CPU local the our GPU, so that all memory we allocate + // on the host is local. 
+ //NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity)); + //cpu_set_t affinitySave; + // if (CPU_COUNT(&comm->cpuAffinity)) { + //sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + //sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + //} + ncclResult_t ret; + + // Launch proxy service thread + //NCCLCHECK(ncclProxyCreate(comm)); + // Get rings and trees //struct ncclTopoGraph ringGraph; ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; - ringGraph.crossNic = ncclParamCrossNic(); ringGraph.collNet = 0; ringGraph.minChannels = 1; ringGraph.maxChannels = MAXCHANNELS/2; @@ -564,7 +536,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t //struct ncclTopoGraph treeGraph; treeGraph.id = 1; treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; - treeGraph.crossNic = ncclParamCrossNic(); treeGraph.collNet = 0; treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels; treeGraph.maxChannels = ringGraph.nChannels; @@ -575,56 +546,55 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t collNetGraph.id = 2; collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; - collNetGraph.crossNic = ncclParamCrossNic(); - collNetGraph.minChannels = 1; - collNetGraph.maxChannels = ringGraph.nChannels; + collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); - bool allXgmi = true; + bool allXgmi = true, hasPeerAccess = true; + // Check that all the GPUs have peer access to one another and are XGMI connected + for (int i = 0; i < nranks && hasPeerAccess; i++) { + int cudaDev1 = comm->peerInfo[i].cudaDev; + for (int j = 0; j < nranks; j++) { + if (i == j) continue; + int cudaDev2 = comm->peerInfo[j].cudaDev; + int p2p; + if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p) + { + 
hasPeerAccess = false; + break; + } + + bool isXGMI; + // Limit to single intermediate GPU for enabling clique + NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1)); + allXgmi &= isXGMI; + } + } + +#if 0 { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager - //CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED; + CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED; if (comm->localRanks == comm->nRanks && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) { - // Check that all the GPUs have peer access to one another and are XGMI connected - bool hasPeerAccess = true; - for (int i = 0; i < nranks && hasPeerAccess; i++) + if (hasPeerAccess) { - int cudaDev1 = allGather1Data[i].peerInfo.cudaDev; - for (int j = 0; j < nranks; j++) - { - if (i == j) continue; - int cudaDev2 = allGather1Data[j].peerInfo.cudaDev; - //int p2p; - //if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p) - //{ - // hasPeerAccess = false; - // break; - //} - - bool isXGMI; - NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI, 1)); - allXgmi &= isXGMI; - } + if (intraProcRanks == nranks) + cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS; + else + cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE; } - //if (hasPeerAccess) - //{ - // if (intraRanks == nranks) - // cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS; - // else - // cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE; - //} // For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected - //if (!allXgmi && !rcclParamCliqueIgnoreTopo()) - //{ - // INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)"); - // cliqueMode = CliqueManager::CLIQUE_DISABLED; - //} + if (!allXgmi && !rcclParamCliqueIgnoreTopo()) + { + INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)"); + cliqueMode = CliqueManager::CLIQUE_DISABLED; + } } 
- //comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode); - //NCCLCHECK(comm->cliqueManager->Init(commId, rootPid)); + comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode); + NCCLCHECK(comm->cliqueManager->Init(commId, rootPid)); } // [/RCCL] +#endif if (comm->rank == ncclParamGraphDumpFileRank()) { struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph }; @@ -633,19 +603,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t // Determine local CollNet support before all-gather if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1; - if (intraNodeRanks > 8) { - if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node"); - comm->collNetSupport = 0; - } - if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) { - if (rcclParamP2pNetDisable() == 0) { - comm->p2pNet = 1; - INFO(NCCL_INIT, "RCCL enabled same node P2P over network"); - } - else - INFO(NCCL_INIT, "RCCL force disabled same node P2P over network"); - } // AllGather3 - begin #if 0 struct ncclGraphInfo { @@ -659,6 +617,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t }; struct { + int netDev; int collNetSupport; int nc; struct ncclGraphInfo tree; @@ -670,7 +629,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); #endif int idx; - NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx)); + NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx)); allGather3Data[rank].nc = 2; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi) allGather3Data[rank].nc = 4; @@ -684,6 +643,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t allGather3Data[rank].nc = 
std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev)); allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; @@ -717,25 +677,57 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { int rank = comm->rank; int nranks = comm->nRanks; + ncclResult_t ret; //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); // Determine nNodes, firstRanks, ... int *nodesFirstRank, *nodesTreePatterns; NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks)); NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks)); - for (int i=0; inNodes; n++) { - if (nodesFirstRank[n] == firstRank) node = n; - } - if (node == -1) { - node = comm->nNodes++; + NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks)); + for (int r=0; rnNodes && nodesFirstRank[node] != firstRank; node++); + if (node == comm->nNodes) { + comm->nNodes++; nodesFirstRank[node] = firstRank; // Record tree pattern of each node as they can be different depending on sm arch - nodesTreePatterns[node] = allGather3Data[i].tree.pattern; + nodesTreePatterns[node] = allGather3Data[r].tree.pattern; } - if (i == comm->rank) comm->node = node; + comm->rankToNode[r] = node; + } + // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node + NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes)); + NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks)); + for (int r=0; rnRanks; r++) { + int node = comm->rankToNode[r]; + comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks; + comm->nodeRanks[node].localRanks++; + } + // Allocate ranks arrays for each node + for (int n=0; nnNodes; n++) 
{ + NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks)); + comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks); + comm->nodeRanks[n].localRanks = 0; + } + // And fill the ranks arrays + for (int r=0; rnRanks; r++) { + int node = comm->rankToNode[r]; + comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r; + } + comm->node = comm->rankToNode[rank]; + comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank; + comm->localRank = comm->rankToLocalRank[rank]; + comm->localRanks = comm->nodeRanks[comm->node].localRanks; + + TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); + if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { + WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, + comm->localRank, comm->localRanks, comm->localRankToRank[0]); + return ncclInternalError; } int nChannelsOrig = comm->nChannels; @@ -743,6 +735,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); int nc = allGather3Data[0].nc; for (int i=0; ipeerInfo[i].netDev = allGather3Data[i].netDev; allTopoRanks[i] = &allGather3Data[i].topoRanks; nc = std::min(allGather3Data[i].nc, nc); // Make sure we align all ranks so that the tuning is consistent across ranks @@ -750,20 +743,20 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, 
treeGraph.speedInter); - treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); - treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter); + treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); + treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter); ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); - ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); - ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter); + ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); + ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter); collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); - collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); - collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); + collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); + collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); } @@ -776,12 +769,20 
@@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t for (int i=0; inChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } - // Determine CollNet support after all-gather now that we know nNodes - int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); - if (comm->nNodes < collNetNodeThreshold) { - if (comm->collNetSupport == 1) + // Determine CollNet support after all-gather now that we know nNodes and each node localRanks + if (comm->collNetSupport == 1) { + int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); + if (comm->nNodes < collNetNodeThreshold) { INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); - comm->collNetSupport = 0; + comm->collNetSupport = 0; + } + for (int n=0; nnNodes; n++) { + if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) { + WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1); + comm->collNetSupport = 0; + break; + } + } } int *rings; @@ -808,16 +809,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); - // Set Affinity to a CPU local the our GPU, so that all memory we allocate - // on the host is local. 
- //NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity)); - //cpu_set_t affinitySave; - //if (CPU_COUNT(&comm->cpuAffinity)) { - // sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - // sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - //} - ncclResult_t ret; - //NCCLCHECK(computeBuffSizes(comm)); // Connect with prev/next for each ring @@ -828,7 +819,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore); - if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) { + if (ringGraph.nIntraChannels) { comm->useIntraNet = 1; // Connect NET for intranode use for (int c=0; cnChannels; c++) { @@ -854,7 +845,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t // Check if we can setup CollNet if (comm->collNetSupport > 0) { int collNetSetupFail = 0; - int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P}; + int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P}; // Find all head ranks int nHeads = collNetGraph.nChannels; int *heads; @@ -894,13 +885,13 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t // Exchange highest intra-node transport type among ranks // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer - comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; - //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int))); - //for (int i=0; ilocalRanks; i++) { - //if (highestTypes[i] > comm->intraHighestTransportType) - //comm->intraHighestTransportType = highestTypes[i]; - //} - INFO(NCCL_INIT, "rank %d Connected CollNet", rank); + comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; + //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int))); + for (int i=0; ilocalRanks; i++) { + if (highestTypes[i] > comm->intraHighestTransportType) + comm->intraHighestTransportType = highestTypes[i]; + } + INFO(NCCL_INIT, "rank %d Connected CollNet comm %p nRanks %02d", rank, comm, comm->nRanks); collnet_cleanup: free(heads); @@ -913,19 +904,96 @@ collnet_cleanup: TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations - NCCLCHECK(ncclTopoTuneModel(comm, 1, 1, &treeGraph, &ringGraph, &collNetGraph)); + do { + int myCompCap = comm->peerInfo[rank].cudaCompCap; + int minCompCap = myCompCap, maxCompCap = myCompCap; + for (int i = 0; i < nranks; i++) { + minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap); + maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap); + } + NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + } while(0); // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); +#if 0 + if (ncclParamNvbPreconnect()) { + // Connect p2p when using NVB path + int nvbNpeers; + int* nvbPeers; + NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers)); + for (int r=0; rnRanks + 
(comm->rank-peer)) % comm->nRanks; + for (int c=0; cp2pnChannelsPerPeer; c++) { + int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector + comm->connectRecv[peer] |= (1<nRanks - (comm->rank-peer)) % comm->nRanks; + for (int c=0; cp2pnChannelsPerPeer; c++) { + int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector + comm->connectSend[peer] |= (1<topo, comm->rank, &proxyConn.localRank)); + //NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn)); + //NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0)); - //NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm)); + // Then to remote ones when using PXN + if (ncclPxnDisable() == 0) { + int nranks; + int* pxnPeers; + NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks)); + for (int r=0; rp2pnChannels, sizeof(int), NULL, 0)); + } + free(pxnPeers); + } - //if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm)); + do { + // Compute intra-process ranks + int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; + for (int i = 0; i < nranks; i++) { + if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) + && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { + // Rank is in same process + if (intraProcRanks == 0) intraProcRank0 = i; + if (i == rank) intraProcRank = intraProcRanks; + intraProcRanks++; + } + } + TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", + rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); + if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) { + WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d 
intraProcRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, + intraProcRank, intraProcRanks, intraProcRank0); + return ncclInternalError; + } + //NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm)); + } while(0); + + /* Local intra-node barrier */ + //NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); + + // Unlink proxy shm to make sure it will be properly cleaned up. + //NCCLCHECK(ncclProxyShmUnlink(comm)); // We should have allocated all buffers, collective fifos, ... we can // restore the affinity. affinity_restore: - //sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + //if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); if (ret != ncclSuccess) return ret; TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);