diff --git a/ext-net/example/Makefile b/ext-net/example/Makefile index efa841c53c..e0a6aa6193 100644 --- a/ext-net/example/Makefile +++ b/ext-net/example/Makefile @@ -5,7 +5,7 @@ # NCCL_HOME:=../../build/ CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include +INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl PLUGIN_SO:=libnccl-net.so default: $(PLUGIN_SO) diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 6b5b62c729..f5101aec8b 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -24,6 +24,7 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +#include "net_v7.h" #include "net_v6.h" #include "net_v5.h" #include "net_v4.h" diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h new file mode 100644 index 0000000000..32cc519ded --- /dev/null +++ b/ext-net/example/nccl/net_device.h @@ -0,0 +1,31 @@ +/************************************************************************* + * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_DEVICE_H_ +#define NET_DEVICE_H_ + +#include "net_device.h" + +#define NCCL_NET_DEVICE_INVALID_VERSION 0x0 +#define NCCL_NET_MTU_SIZE 4096 + +// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin +// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 + +typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; + +typedef struct { + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + void* handle; + size_t size; + int needsProxyProgress; +} ncclNetDeviceHandle_v7_t; + +typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; + +#endif diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h index 8bc16787b5..21379d3d11 100644 --- a/ext-net/example/nccl/net_v6.h +++ b/ext-net/example/nccl/net_v6.h @@ -18,8 +18,6 @@ typedef struct { int maxRecvs; // Maximum number of grouped receives. }ncclNetProperties_v6_t; -typedef ncclNetProperties_v6_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; diff --git a/ext-net/example/nccl/net_v7.h b/ext-net/example/nccl/net_v7.h new file mode 100644 index 0000000000..77d6cb73ee --- /dev/null +++ b/ext-net/example/nccl/net_v7.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V7_H_ +#define NCCL_NET_V7_H_ + +#include "net_device.h" + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. 
+ ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef ncclNetProperties_v7_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If sizes is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +#endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index a44ce9e51d..cc860b0067 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "net.h" #define __hidden __attribute__ ((visibility("hidden"))) @@ -15,14 +15,14 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } -__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) { +__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) { //pluginPciPath(dev, &props.pciPath); //pluginPtrSupport(dev, &props.ptrSupport); return ncclInternalError; } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; } -__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; 
} +__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} @@ -33,10 +33,12 @@ __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return n __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; } +__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; } #define PLUGIN_NAME "Plugin" -const ncclNet_v6_t ncclNetPlugin_v6 = { +const ncclNet_v7_t ncclNetPlugin_v7 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -54,6 +56,37 @@ const ncclNet_v6_t ncclNetPlugin_v6 = { .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, +}; + +__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) { + //pluginPciPath(dev, &props.pciPath); + //pluginPtrSupport(dev, &props.ptrSupport); + return ncclInternalError; +} + +__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; } +__hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { return ncclInternalError; } + +const ncclNet_v6_t ncclNetPlugin_v6 = { + 
.name = PLUGIN_NAME, + .init = pluginInit, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v6, + .listen = pluginListen, + .connect = pluginConnect_v6, + .accept = pluginAccept_v6, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend, + .irecv = pluginIrecv, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen }; /* v5 Compat */ @@ -61,10 +94,10 @@ const ncclNet_v5_t ncclNetPlugin_v5 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, - .getProperties = pluginGetProperties, + .getProperties = pluginGetProperties_v6, .listen = pluginListen, - .connect = pluginConnect, - .accept = pluginAccept, + .connect = pluginConnect_v6, + .accept = pluginAccept_v6, .regMr = pluginRegMr, .deregMr = pluginDeregMr, .isend = pluginIsend, @@ -79,7 +112,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = { /* v4 Compat */ static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) { ncclNetProperties_v6_t props_v6; - ncclResult_t ret = pluginGetProperties(dev, &props_v6); + ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6); if (ret != ncclSuccess) return ret; props->name = props_v6.name; props->pciPath = props_v6.pciPath; @@ -103,14 +136,16 @@ static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) { ncclResult_t ret; do { - ret = pluginConnect(dev, handle, sendComm); + ncclNetDeviceHandle_v7_t* devHandle = NULL; + ret = pluginConnect(dev, handle, sendComm, &devHandle); } while (ret == ncclSuccess && *sendComm == NULL); return ret; } static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) { ncclResult_t ret; do { - ret = pluginAccept(listenComm, recvComm); + ncclNetDeviceHandle_v7_t* devHandle = NULL; + ret = pluginAccept(listenComm, recvComm, &devHandle); } while (ret == 
ncclSuccess && *recvComm == NULL); return ret; } @@ -151,12 +186,12 @@ static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) { static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm); - memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3); + memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4); return ret; } static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; - memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3); + memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4); return pluginConnect_v4(dev, &pluginHandle, sendComm); } const ncclNet_v3_t ncclNetPlugin_v3 = { diff --git a/ext-tuner/example/Makefile b/ext-tuner/example/Makefile new file mode 100644 index 0000000000..9d9ace4842 --- /dev/null +++ b/ext-tuner/example/Makefile @@ -0,0 +1,17 @@ +# +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +NCCL_HOME:=../../build/ +CUDA_HOME:=/usr/local/cuda +INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl +PLUGIN_SO:=libnccl-tuner.so + +default: $(PLUGIN_SO) + +$(PLUGIN_SO): plugin.c + $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +clean: + rm -f $(PLUGIN_SO) diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h new file mode 100644 index 0000000000..8076aa872a --- /dev/null +++ b/ext-tuner/example/nccl/tuner.h @@ -0,0 +1,77 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // nRanks: number of ranks in current communicator. Each communicator initializes its own tuner. + // nNodes: number of nodes in current communicator. + // logFunction: a logFunction can be useful to integrate logging together with NCCL core. + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
+ // Inputs: + // - collType: collective type, e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - collNetSupport: whether collnet supports this type + // - nvlsSupport: whether nvlink sharp supports this type + // - numPipeOps: number of operations in the group + // + // Outputs: + // - algorithm: selected algorithm to be used for the given collective + // - protocol: selected protocol to be used for the given collective + // - nChannels: number of channels (hence SMs) to be used. + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes, + int collNetSupport, int nvlsSupport, int numPipeOps, + int *algorithm, int *protocol, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + ncclResult_t (*destroy)(); +} ncclTuner_v1_t; + +typedef ncclTuner_v1_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" + +#endif diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c new file mode 100644 index 0000000000..d972de3d3a --- /dev/null +++ b/ext-tuner/example/plugin.c @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "tuner.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; } + +__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes, + int collNetSupport, int nvlsSupport, int numPipeOps, + int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; } + +__hidden ncclResult_t pluginDestroy() { return ncclSuccess; } + +#define PLUGIN_NAME "Example" + +const ncclTuner_v1_t ncclTunerPlugin_v1 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .getCollInfo = pluginGetCollInfo, + .destroy = pluginDestroy +}; diff --git a/makefiles/common.mk b/makefiles/common.mk index 60a019c0b2..a037cf348b 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -9,6 +9,7 @@ PREFIX ?= /usr/local VERBOSE ?= 0 KEEP ?= 0 DEBUG ?= 0 +ASAN ?= 0 TRACE ?= 0 PROFAPI ?= 1 NVTX ?= 1 @@ -85,6 +86,13 @@ NVCUFLAGS += -O0 -G -g CXXFLAGS += -O0 -g -ggdb3 endif +# Make sure to run with ASAN_OPTIONS=protect_shadow_gap=0 otherwise CUDA will fail with OOM +ifneq ($(ASAN), 0) +CXXFLAGS += -fsanitize=address +LDFLAGS += -fsanitize=address -static-libasan +NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan +endif + ifneq ($(VERBOSE), 0) NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra diff --git a/makefiles/version.mk b/makefiles/version.mk index fde92c08a0..5e32150b1c 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 18 -NCCL_PATCH := 6 +NCCL_MINOR := 19 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index dd5754989e..7a1881d9d6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -3,19 +3,17 @@ # # See 
LICENSE.txt for license information # - include ../makefiles/common.mk include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \ - misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \ - misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \ - misc/ipcsocket.cc \ - transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \ - collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ - graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc +LIBSRCFILES := \ + bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ + init.cc init_nvtx.cc net.cc proxy.cc transport.cc \ + $(wildcard graph/*.cc) \ + $(wildcard misc/*.cc) \ + $(wildcard transport/*.cc) ##### lib files LIBNAME := libnccl.so @@ -45,7 +43,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl -DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a +DEVMANIFEST := $(BUILDDIR)/obj/device/manifest ##### rules build : lib staticlib @@ -54,8 +52,8 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) -$(DEVICELIB): ALWAYS_REBUILD $(INCTARGETS) - $(MAKE) -C collectives/device +$(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS) + $(MAKE) -C ./device # Empty target to force rebuild ALWAYS_REBUILD: @@ -75,21 +73,17 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \ $< > $@ -$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) +$(LIBDIR)/$(LIBTARGET): 
$(LIBOBJ) $(DEVMANIFEST) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) - $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS) + $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) -null := -space := $(null) # -comma := , - -$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) +$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST) @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ mkdir -p $(LIBDIR) - printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M + ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(PKGDIR)/nccl.pc : nccl.pc.in mkdir -p $(PKGDIR) @@ -126,7 +120,7 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS) @rm -f $(@:%.o=%.d.tmp) clean : - $(MAKE) -C collectives/device clean + $(MAKE) -C device clean rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} install : build diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 764cb6c391..0c8a338d6e 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -12,6 +12,7 @@ #include #include #include "proxy.h" +#include "param.h" struct bootstrapRootArgs { struct ncclSocket* listenSock; @@ -28,21 +29,24 @@ ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { pthread_mutex_lock(&bootstrapNetLock); if (bootstrapNetInitDone == 0) { - char* env = getenv("NCCL_COMM_ID"); + const char* env = ncclGetEnv("NCCL_COMM_ID"); if (env) { union ncclSocketAddress remoteAddr; if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); + pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { WARN("NET/Socket : No usable listening interface found"); + 
pthread_mutex_unlock(&bootstrapNetLock); return ncclSystemError; } } else { int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); + pthread_mutex_unlock(&bootstrapNetLock); return ncclInternalError; } } @@ -189,7 +193,7 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { memset(handle, 0, sizeof(ncclBootstrapHandle)); NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); - char* env = getenv("NCCL_COMM_ID"); + const char* env = ncclGetEnv("NCCL_COMM_ID"); if (env) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) { diff --git a/src/collectives.cc b/src/collectives.cc new file mode 100644 index 0000000000..89d8932db8 --- /dev/null +++ b/src/collectives.cc @@ -0,0 +1,167 @@ +/************************************************************************* + * Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "argcheck.h" // Need some checks here since we access comm +#include "collectives.h" +#include "enqueue.h" +#include "nccl.h" + +NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + // Just pass the size of one message and not the total bytes sent/received. 
+ constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} + }; + size_t msgsize = sendcount * ncclTypeSize(datatype); + NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) + + struct ncclInfo info = { ncclFuncAllGather, "AllGather", + sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ + ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + +NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { + struct NvtxParamsAllReduce { + size_t bytes; + ncclRedOp_t op; + }; + // Just pass the size of one message and not the total bytes sent/received. + static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, + {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, + offsetof(NvtxParamsAllReduce, op)} + }; + NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; + NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) + + struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", + sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ + ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + +NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + struct NvtxParamsBroadcast { + size_t bytes; + int root; + }; + constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { + {0, 
NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} + }; + NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; + NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) + + struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", + sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ + BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} +/* Deprecated original "in place" function, similar to MPI */ +NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); +} + +NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + struct NvtxParamsReduce { + size_t bytes; + int root; + ncclRedOp_t op; + }; + constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, + {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, + offsetof(NvtxParamsReduce, op)} + }; + NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; + NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) + + struct ncclInfo info = { ncclFuncReduce, "Reduce", + sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ + REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + 
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { + struct NvtxParamsReduceScatter { + size_t bytes; + ncclRedOp_t op; + }; + constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, + {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, + offsetof(NvtxParamsReduceScatter, op)} + }; + NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; + NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) + + struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", + sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ + REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + +struct NvtxParamsSendRecv { + size_t bytes; + int peer; +}; +constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { + {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, + {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} +}; + +NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) { + NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; + NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) + + struct ncclInfo info = { ncclFuncSend, "Send", + NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ + 1, 1 }; + ncclResult_t ret; + NCCLCHECK(ncclGroupStart()); + ret = ncclEnqueueCheck(&info); + NCCLCHECK(ncclGroupEnd()); + return ret; 
+} + +NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) { + NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; + NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) + + struct ncclInfo info = { ncclFuncRecv, "Recv", + NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ + 1, 1 }; + ncclResult_t ret; + NCCLCHECK(ncclGroupStart()); + ret = ncclEnqueueCheck(&info); + NCCLCHECK(ncclGroupEnd()); + return ret; +} diff --git a/src/collectives/all_gather.cc b/src/collectives/all_gather.cc deleted file mode 100644 index 97ec981ed4..0000000000 --- a/src/collectives/all_gather.cc +++ /dev/null @@ -1,25 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "enqueue.h" -#include "collectives.h" - -NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { - // Just pass the size of one message and not the total bytes sent/received. 
- constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} - }; - size_t msgsize = sendcount * ncclTypeSize(datatype); - NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) - - struct ncclInfo info = { ncclFuncAllGather, "AllGather", - sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ - ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; - return ncclEnqueueCheck(&info); -} diff --git a/src/collectives/all_reduce.cc b/src/collectives/all_reduce.cc deleted file mode 100644 index 8ac61a2a78..0000000000 --- a/src/collectives/all_reduce.cc +++ /dev/null @@ -1,31 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "enqueue.h" -#include "nccl.h" - -NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsAllReduce { - size_t bytes; - ncclRedOp_t op; - }; - // Just pass the size of one message and not the total bytes sent/received. 
- static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsAllReduce, op)} - }; - NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) - - struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", - sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ - ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; - return ncclEnqueueCheck(&info); -} diff --git a/src/collectives/broadcast.cc b/src/collectives/broadcast.cc deleted file mode 100644 index c73502eedb..0000000000 --- a/src/collectives/broadcast.cc +++ /dev/null @@ -1,37 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "enqueue.h" -#include "collectives.h" - -NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsBroadcast { - size_t bytes; - int root; - }; - constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} - }; - NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; - NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) - - struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", - sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ - BROADCAST_CHUNKSTEPS, 
BROADCAST_SLICESTEPS }; - return ncclEnqueueCheck(&info); -} -/* Deprecated original "in place" function, similar to MPI */ -NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream) { - return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); -} - diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile deleted file mode 100644 index a2498a00c7..0000000000 --- a/src/collectives/device/Makefile +++ /dev/null @@ -1,76 +0,0 @@ -# -# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. -# -# See LICENSE.txt for license information -# - -include ../../../makefiles/common.mk -include ../../../makefiles/version.mk - -BUILDDIR ?= $(abspath ../../../build) -OBJDIR := $(BUILDDIR)/obj/collectives/device - -LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu - -LIBSRCFILES += functions.cu - -DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) -DEPENDFILES:= $(DEPFILES:%.d=%.dep) -STATICLIB := $(OBJDIR)/colldevice.a -DEVOBJ := $(OBJDIR)/devlink.o -RULESFILE := $(OBJDIR)/Makefile.rules - -NVCUFLAGS += -I. -I.. 
-I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" - - -all: $(STATICLIB) - -# Dummy rule so that the extra dependency (%.dep) files are preserved by make -all_deps: $(DEPENDFILES) - -# Auto-generating the rules per op/reduction/datatype/algorithm -$(RULESFILE) : gen_rules.sh - @printf "Generating %-35s > %s\n" rules $@ - @mkdir -p $(OBJDIR) - @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ - --include $(RULESFILE) - -LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o - --include $(DEPFILES) - -$(STATICLIB): $(LIBOBJ) $(DEVOBJ) - @printf "Archiving %-35s > %s\n" objects $@ - ar cr $@ $^ - -# We do not want make to build *.d when running make clean. -# So we only provide targets for .dep which will produce .dep and .d, -# with only .d being included, and .dep keeping track of what needs to -# be regenerated. -$(OBJDIR)/%.dep : %.cu - @mkdir -p $(OBJDIR) - @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp - @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@ - @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \ - sed -e 's/^ *//' -e 's/$$/:/' >> $@ - @rm -f $@.tmp - @cp $@ $(@:.dep=.d) - -# Compiled kernels and collectives with relocatable device code ... -$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) $(NVCUFLAGS) -dc $< -o $@ - -$(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) $(NVCUFLAGS) -dc $< -o $@ - -# ... and create the device-side linked object with all those. 
-$(DEVOBJ) : $(LIBOBJ) - $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ - -clean: - rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB) diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu deleted file mode 100644 index 4022e2e9f5..0000000000 --- a/src/collectives/device/all_gather.cu +++ /dev/null @@ -1,11 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "all_gather.h" -#include "common.h" -#include "collectives.h" - -IMPL_COLL_C(AllGather); diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu deleted file mode 100644 index e7c3c28cfb..0000000000 --- a/src/collectives/device/all_reduce.cu +++ /dev/null @@ -1,11 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "all_reduce.h" -#include "common.h" -#include "collectives.h" - -IMPL_COLL_R(AllReduce); diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu deleted file mode 100644 index 77595858bf..0000000000 --- a/src/collectives/device/broadcast.cu +++ /dev/null @@ -1,11 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "broadcast.h" -#include "common.h" -#include "collectives.h" - -IMPL_COLL_C(Broadcast); diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu deleted file mode 100644 index 52e9efe842..0000000000 --- a/src/collectives/device/functions.cu +++ /dev/null @@ -1,122 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "devcomm.h" -#include "collectives.h" -#include "common.h" - -__shared__ ncclShmemData ncclShmem; -#if __CUDA_ARCH__ < 700 - __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; -#endif - -#define NCCL_FUNC5(func, algo, devredop, type, nullify) \ - MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ - MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \ - MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) - -#define NCCL_FUNC4(func, devredop, type, nullify) \ - NCCL_FUNC5(func, TREE, devredop, type, nullify), \ - NCCL_FUNC5(func, RING, devredop, type, nullify), \ - NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \ - NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \ - NCCL_FUNC5(func, NVLS, devredop, type, nullify), \ - NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify) - -#if defined(__CUDA_BF16_TYPES_EXIST__) -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, devredop, nullForFloat) \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, uint8_t, 0), \ - NCCL_FUNC4(func, devredop, int32_t, 0), \ - NCCL_FUNC4(func, devredop, uint32_t, 0), \ - NCCL_FUNC4(func, 
devredop, int64_t, 0), \ - NCCL_FUNC4(func, devredop, uint64_t, 0), \ - NCCL_FUNC4(func, devredop, half, nullForFloat), \ - NCCL_FUNC4(func, devredop, float, nullForFloat), \ - NCCL_FUNC4(func, devredop, double, nullForFloat), \ - NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat) -#define NCCL_FUNCS3B(func, devredop) \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0) -#else -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(func, devredop, nullForFloat) \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, uint8_t, 0), \ - NCCL_FUNC4(func, devredop, int32_t, 0), \ - NCCL_FUNC4(func, devredop, uint32_t, 0), \ - NCCL_FUNC4(func, devredop, int64_t, 0), \ - NCCL_FUNC4(func, devredop, uint64_t, 0), \ - NCCL_FUNC4(func, devredop, half, nullForFloat), \ - NCCL_FUNC4(func, devredop, float, nullForFloat), \ - NCCL_FUNC4(func, devredop, double, nullForFloat) -#define NCCL_FUNCS3B(func, devredop) \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0), \ - NCCL_FUNC4(func, devredop, int8_t, 0) -#endif - -// Must be consistent with ncclRedOp_t -#define NCCL_FUNCS2A(func) \ - NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \ - NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \ - NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \ - NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \ - 
NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \ - NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1) - -#define NCCL_FUNCS2B(func) \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum), \ - NCCL_FUNCS3B(func, Sum) - -// Must be consistent with the ncclFuncSet enum -__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { -// Don't try to initialize the host shadow copy of this device-side global -// variable. There is no host pointer to a device-side function, which -// confuses clang. This will be fixed in the next clang release. -#if __CUDA_ARCH__ - NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, half), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, float), - NCCL_ONERANK_REDUCE_NAME(PreMulSum, double), - #if defined(__CUDA_BF16_TYPES_EXIST__) - NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16), - #endif - NCCL_FUNCS2B(Broadcast), - NCCL_FUNCS2A(Reduce), - NCCL_FUNCS2B(AllGather), - NCCL_FUNCS2A(ReduceScatter), - NCCL_FUNCS2A(AllReduce) -#endif -}; - -// Workaround for https://reviews.llvm.org/D55580 -__device__ void ncclWorkaroundClangD55580() {} diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh deleted file mode 100755 index 8c7387c701..0000000000 --- a/src/collectives/device/gen_rules.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# -# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
-# -# See LICENSE.txt for license information -# - -dir=$1 - -datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64" -if [ "$CUDA_MAJOR" -ge 11 ] -then - datatypes+=" bf16" -fi - -targets="GENOBJS := \\\\\n" - -for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do - opn=0 - for op in sum prod min max premulsum sumpostdiv; do - dtn=0 - # Order must match that of the ncclDataType_t enum - for dt in ${datatypes}; do - # Generate a unique filename for each compilation unit, - # otherwise the __nv_module_id may conflict at link time - echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu" - echo " @printf \"Copying %-35s > %s\\\\n\" \$< \$@" - echo " cp \$< \$@" - echo "" - # Compile the file - echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep" - - echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" - echo " mkdir -p ${dir}" - echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@" - echo "" - targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" - dtn=$(($dtn + 1)) - done - opn=$(($opn + 1)) - done -done -echo -e "$targets" diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu deleted file mode 100644 index 552d1f2050..0000000000 --- a/src/collectives/device/onerank_reduce.cu +++ /dev/null @@ -1,62 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "devcomm.h" -#include "collectives.h" -#include "common_kernel.h" -#include "common.h" - -namespace { - template - __device__ __forceinline__ void oneRankReduce() { - ncclWork *w = &ncclShmem.work; - int tid = threadIdx.x; - int tn = blockDim.x; - #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) { - ncclWorkElem *we = &w->elems[e]; - intptr_t eltN = we->count; - int bid = we->bid; - int bn = we->nChannels; - T const *src = (T const*)we->sendbuff; - T *dst = (T*)we->recvbuff; - - // each block/channel gets a roughly equal segment of 16 byte packs - constexpr int EltPerPack = 16/sizeof(T); - intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack; - intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn); - intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn); - i0 *= EltPerPack; - i0 = i0 < eltN ? i0 : eltN; - i1 *= EltPerPack; - i1 = i1 < eltN ? 
i1 : eltN; - src += i0; - dst += i0; - void *vsrc = (void*)src; - void *vdst = (void*)dst; - reduceCopy - (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0); - } - } -} - -#define INSTANTIATE(devredop, type) \ - __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ - oneRankReduce>(); \ - } - -INSTANTIATE(PreMulSum, int8_t) -INSTANTIATE(PreMulSum, uint8_t) -INSTANTIATE(PreMulSum, int32_t) -INSTANTIATE(PreMulSum, uint32_t) -INSTANTIATE(PreMulSum, int64_t) -INSTANTIATE(PreMulSum, uint64_t) -INSTANTIATE(PreMulSum, half) -#if defined(__CUDA_BF16_TYPES_EXIST__) -INSTANTIATE(PreMulSum, __nv_bfloat16) -#endif -INSTANTIATE(PreMulSum, float) -INSTANTIATE(PreMulSum, double) diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu deleted file mode 100644 index 66f1bb2ec2..0000000000 --- a/src/collectives/device/reduce.cu +++ /dev/null @@ -1,11 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "reduce.h" -#include "common.h" -#include "collectives.h" - -IMPL_COLL_R(Reduce); diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu deleted file mode 100644 index c2c6d42806..0000000000 --- a/src/collectives/device/reduce_scatter.cu +++ /dev/null @@ -1,11 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "reduce_scatter.h" -#include "common.h" -#include "collectives.h" - -IMPL_COLL_R(ReduceScatter); diff --git a/src/collectives/device/sendrecv.cu b/src/collectives/device/sendrecv.cu deleted file mode 100644 index 59e38b528e..0000000000 --- a/src/collectives/device/sendrecv.cu +++ /dev/null @@ -1,11 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "sendrecv.h" -#include "common.h" -#include "collectives.h" - -IMPL_COLL_P(SendRecv); diff --git a/src/collectives/reduce.cc b/src/collectives/reduce.cc deleted file mode 100644 index 63355162f7..0000000000 --- a/src/collectives/reduce.cc +++ /dev/null @@ -1,33 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "enqueue.h" -#include "collectives.h" -#include "nccl.h" - -NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsReduce { - size_t bytes; - int root; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduce, op)} - }; - NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; - NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) - - struct ncclInfo info = { ncclFuncReduce, "Reduce", - sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ - REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; - return ncclEnqueueCheck(&info); -} diff --git a/src/collectives/reduce_scatter.cc b/src/collectives/reduce_scatter.cc deleted file mode 100644 index 5242545490..0000000000 --- a/src/collectives/reduce_scatter.cc +++ /dev/null @@ -1,31 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "enqueue.h" -#include "collectives.h" -#include "nccl.h" - -NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsReduceScatter { - size_t bytes; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduceScatter, op)} - }; - NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) - - struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", - sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ - REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; - return ncclEnqueueCheck(&info); -} diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc deleted file mode 100644 index 9a81b0a935..0000000000 --- a/src/collectives/sendrecv.cc +++ /dev/null @@ -1,52 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "enqueue.h" -#include "collectives.h" -#include "argcheck.h" // Need some checks here since we access comm - -struct NvtxParamsSendRecv { - size_t bytes; - int peer; -}; -constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} -}; - -NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) - - struct ncclInfo info = { ncclFuncSend, "Send", - NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ - 1, 1 }; - ncclResult_t ret; - NCCLCHECK(ncclGroupStart()); - ret = ncclEnqueueCheck(&info); - NCCLCHECK(ncclGroupEnd()); - return ret; -} - -NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) - - struct ncclInfo info = { ncclFuncRecv, "Recv", - NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ - 1, 1 }; - ncclResult_t ret; - NCCLCHECK(ncclGroupStart()); - ret = ncclEnqueueCheck(&info); - NCCLCHECK(ncclGroupEnd()); - return ret; -} diff --git a/src/debug.cc b/src/debug.cc index b88fa5982a..21cec22faa 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -9,6 +9,7 @@ #include 
#include #include +#include "param.h" int ncclDebugLevel = -1; static int pid = -1; @@ -25,7 +26,7 @@ static __thread int tid = -1; void ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } - const char* nccl_debug = getenv("NCCL_DEBUG"); + const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); int tempNcclDebugLevel = -1; if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_NONE; @@ -45,7 +46,7 @@ void ncclDebugInit() { * This can be a comma separated list such as INIT,COLL * or ^INIT,COLL etc */ - char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS"); + const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS"); if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } @@ -97,7 +98,7 @@ void ncclDebugInit() { * then create the debug file. But don't bother unless the * NCCL_DEBUG level is > VERSION */ - const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE"); + const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE"); if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { int c = 0; char debugFn[PATH_MAX+1] = ""; diff --git a/src/device/Makefile b/src/device/Makefile new file mode 100644 index 0000000000..1e9311f1f0 --- /dev/null +++ b/src/device/Makefile @@ -0,0 +1,100 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +SHELL := /usr/bin/env bash +MAKEFLAGS += -r +.SUFFIXES: +.SECONDARY: + +NCCLDIR := ../.. +include $(NCCLDIR)/makefiles/common.mk +include $(NCCLDIR)/makefiles/version.mk + +BUILDDIR ?= $(abspath ../../build) +OBJDIR := $(BUILDDIR)/obj/device + +MANIFEST := $(OBJDIR)/manifest +DEVGLUE_OBJ := $(OBJDIR)/device_glue.o + +INCFLAGS = -I. -I.. 
-I$(BUILDDIR)/include -I../include +NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" +CXXFLAGS += $(INCFLAGS) + +SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY + +COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1 +COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1 +define COMPILE +@$(SAY) "Compiling" $2;\ + mkdir -p $(dir $1);\ + $(call COMPILE$(suffix $2),$1,$2) +endef + +DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1 +DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1 +define DEPENDS +@$(SAY) "Dependencies" $2;\ + mkdir -p $(dir $1);\ + mk=$$($(call DEPENDS$(suffix $2),$2));\ + [[ $$mk =~ ^[^:]*:(.*)$$ ]];\ + files=$${BASH_REMATCH[1]};\ + files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\ + files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\ + echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1 +endef + +all: $(MANIFEST) + +ifeq (1,1) +# Case if the directory is generated on-demand: +$(OBJDIR)/gensrc: generate.py + @mkdir -p $@ + (which python3 >/dev/null || \ + (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \ + printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \ + exit 1)) \ + && ./generate.py $@ "$(ONLY_FUNCS)" +else +# Case if the directory is pre-generated and checked in the repo as ./gen: +$(OBJDIR)/gensrc: + @mkdir -p $(OBJDIR); ln -srfn ./gen $@ +endif + +# The trailing ";" is necessary to make this an "empty recipe": +# https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html +$(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ; + +-include $(OBJDIR)/gensrc/rules.mk +# "gensrc/rules.mk" populates $(LIB_OBJS_GEN) + +SRCS = common.cu onerank.cu + +LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) + +$(OBJDIR)/%.o: % $(OBJDIR)/%.d + $(call 
COMPILE,$@,$<) + +$(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d + $(call COMPILE,$@,$(OBJDIR)/gensrc/$*) + +$(OBJDIR)/%.d: % + $(call DEPENDS,$@,$<) + +$(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/% + $(call DEPENDS,$@,$<) + +$(DEVGLUE_OBJ): $(LIB_OBJS) + $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ + +$(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ) + @echo $^ > $@ + +-include $(wildcard $(OBJDIR)/*.d) +-include $(wildcard $(OBJDIR)/genobj/*.d) + +.PHONY: clean +clean: + rm -rf $(OBJDIR) diff --git a/src/collectives/device/all_gather.h b/src/device/all_gather.h similarity index 61% rename from src/collectives/device/all_gather.h rename to src/device/all_gather.h index 76ae2a4182..0122499320 100644 --- a/src/collectives/device/all_gather.h +++ b/src/device/all_gather.h @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "devcomm.h" +#include "device.h" #include "collectives.h" #include "primitives.h" @@ -108,33 +108,65 @@ struct RunWorkElementlastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*chunkSize; + const ssize_t rank = ncclShmem.comm.rank; - const int nThreadsGather = 128; - const int nThreadsBcast = 384 + WARP_SIZE; + const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; + const int nThreadsGather = args->regUsed ? 
WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; const int tidEndGather = nThreadsGather; const int tidEndBcast = tidEndGather + nThreadsBcast; - using Proto = ProtoSimple<1, 1>; - - if (tid < tidEndGather) { - // Gather - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0); + if (!args->regUsed) { + if (tid < tidEndGather) { + // Gather + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, + args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.gather(offset, nvls->nHeads * size, nelem, size, -1, 0); + } + } else if (tid < tidEndBcast) { + // Bcast through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, + args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.send(offset, nelem); + } } - } else if (tid < tidEndBcast) { - // Bcast through NVLS - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, - args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*chunkSize; - int nelem = min(chunkSize, size-offset); - 
prims.send(offset, nelem); + } else { + /* direct allgather */ + if (tid < tidEndGather) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, + args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + + /* used as sync */ + prims.scatter(0, 0, 0, 0, -1, 0); + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + prims.gather(0, 0, 0, 0, -1, 0); + } + } else if (tid < tidEndBcast) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL, + args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args); + /* used as sync */ + prims.recv(0, 0); + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t inpOffset = gridOffset + bid * chunkSize; + ssize_t outOffset = inpOffset + rank * size; + int nelem = min(chunkSize, size - inpOffset); + prims.directSend(inpOffset, outOffset, nelem); + } } } } diff --git a/src/collectives/device/all_reduce.h b/src/device/all_reduce.h similarity index 85% rename from src/collectives/device/all_reduce.h rename to src/device/all_reduce.h index 32597f1769..bf37dfe962 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "devcomm.h" +#include "device.h" #include "collectives.h" #include "primitives.h" @@ -377,7 +377,6 @@ struct RunWorkElement struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { - #if NCCL_NVLS_ENABLED const int tid = threadIdx.x; const int bid = args->bid; const int nChannels = args->nChannels; @@ -387,10 +386,11 @@ struct RunWorkElementnHeads*chunkSize; const int nranks = ncclShmem.comm.nRanks; const bool hasOut = nvls->out != -1; - const int reduceWarps = hasOut ? 
3 : nranks <= 6 ? 7 : 5; - const int bcastWarps = hasOut ? 2 : 0; - const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2; - const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2; + const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; + const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0; + const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5); + const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; + const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1; const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; @@ -406,67 +406,65 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; - int nelem = min(nvls->nHeads*chunkSize, size-offset); + ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; + int nelem = args->regUsed ? 
0 : min(nvls->nHeads * chunkSize, size - offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, + args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; - int nelem = min(nvls->nHeads*chunkSize, size-offset); + ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; + int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { if (!hasOut) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, - args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, + args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvSend(nelem); + ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } else { // Reduce, send to network using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, 
NULL, - args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, + args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvSend(nelem); + ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, - args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, + args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvSend(nelem); + ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } - #endif // NCCL_NVLS_ENABLED } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { - #if NCCL_NVLS_ENABLED const int tid = threadIdx.x; const int bid = args->bid; const int nChannels = args->nChannels; @@ -478,10 +476,11 @@ struct RunWorkElementnHeads*chunkSize; const int nranks = ncclShmem.comm.nRanks; const bool hasUp = treeUp != -1; - const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5; - const int bcastWarps = hasUp ? 
4 : 0; - const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2; - const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2; + const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; + const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0; + const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5); + const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; + const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1; const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; @@ -497,60 +496,59 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; - int nelem = min(nvls->nHeads*chunkSize, size-offset); + ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; + int nelem = args->regUsed ? 
0 : min(nvls->nHeads * chunkSize, size - offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, + args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; - int nelem = min(nvls->nHeads*chunkSize, size-offset); + ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; + int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { if (!hasUp) { // Reduce and Broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL, - args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL, + args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvSend(nelem); + ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } else { // Reduce, send to network using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, - args->redOpArg, 
2*Proto::MaxGroupWidth, 0, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, + args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvSend(nelem); + ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL, - args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL, + args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvSend(nelem); + ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } - #endif // NCCL_NVLS_ENABLED } }; diff --git a/src/collectives/device/broadcast.h b/src/device/broadcast.h similarity index 99% rename from src/collectives/device/broadcast.h rename to src/device/broadcast.h index ebe4381206..15bf841d50 100644 --- a/src/collectives/device/broadcast.h +++ b/src/device/broadcast.h @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "devcomm.h" +#include "device.h" #include "collectives.h" 
#include "primitives.h" diff --git a/src/device/common.cu b/src/device/common.cu new file mode 100644 index 0000000000..d1b6acd1bc --- /dev/null +++ b/src/device/common.cu @@ -0,0 +1,24 @@ +/************************************************************************* + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "device.h" +#include "collectives.h" +#include "common.h" + +__shared__ ncclShmemData ncclShmem; +#if __CUDA_ARCH__ < 700 + __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; +#endif + +struct RunWorkNop { + __device__ void run(ncclWork *w) {} +}; + +__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { + ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead); +} + +__device__ void ncclDevFunc_Nop() {} diff --git a/src/collectives/device/common.h b/src/device/common.h similarity index 61% rename from src/collectives/device/common.h rename to src/device/common.h index accf8371a3..97581f738d 100644 --- a/src/collectives/device/common.h +++ b/src/device/common.h @@ -8,19 +8,23 @@ #define NCCL_DEVICE_COMMON_H_ #include "collectives.h" -#include "devcomm.h" +#include "device.h" #include "op128.h" +#include "network/unpack/unpack_defs.h" #define COLL_UNROLL (ncclCollUnroll()) -typedef void(*ncclKern_t)(); -extern __device__ ncclKern_t ncclFuncs[]; +typedef void(*ncclDevFuncPtr_t)(); +extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[]; struct ncclShmemGroup { ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY]; ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY]; void* srcs[NCCL_MAX_NVLS_ARITY+1]; void* dsts[NCCL_MAX_NVLS_ARITY+1]; + union { + unpackGroupShmem unpack; + } devicePlugin; }; struct ncclShmemData { @@ -31,6 +35,9 @@ struct ncclShmemData { alignas(16) struct ncclDevComm comm; 
alignas(16) struct ncclDevChannel channel; alignas(16) struct ncclWork work; + alignas(16) union { + unpackShmem unpack; + } devicePlugin; }; static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); @@ -111,10 +118,8 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { } } -template -__device__ void ncclKernel( - struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead - ) { +template +__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { int tid = threadIdx.x; // To map blockId to channelId, we need the n'th set bit of channelMask which @@ -166,7 +171,7 @@ __device__ void ncclKernel( bytes = 0; break; } - copyToShmem16(tid%WARP_SIZE, dst, src, bytes); + if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes); } __syncthreads(); // publish ncclShmem @@ -184,10 +189,10 @@ __device__ void ncclKernel( } __syncthreads(); - if (ncclShmem.work.header.funcIndex == FnIndex) { - RunWork().run(&ncclShmem.work); + if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) { + SpecializedRunWork().run(&ncclShmem.work); } else { - ncclFuncs[ncclShmem.work.header.funcIndex](); + ncclDevFuncTable[ncclShmem.work.header.funcIndex](); } int workIxNext = ncclShmem.work.header.workNext; @@ -204,94 +209,17 @@ __device__ void ncclKernel( } } -// Only generate kernels for SUM -#if NCCL_OP == 0 -#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ -__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \ - struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \ - ) { \ - ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \ - (comm, channelMask, workHead); \ -} -#else -#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded) -#endif +__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); +__device__ void 
ncclDevFunc_Nop(); -// Examples : AllReduce, RING, LL, Sum, uint8 -#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ -__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ - RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ -} +#define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \ + __global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \ + ncclKernelMain, algo, proto>>(comm, channelMask, workHead); \ + } -// Only generate inline kernels for LL -#define IMPL_COLL4(func, algo, devredop, type, ncclType) \ - IMPL_COLL_FUNC(func, algo, LL, devredop, type) \ - IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \ - IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \ - IMPL_COLL_KERN(func, algo, LL, devredop, type, FUNC_INDEX(ncclFunc##func, ncclDev##devredop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \ - -#define IMPL_COLL3(func, devredop, type, ncclType) \ - IMPL_COLL4(func, TREE, devredop, type, ncclType) \ - IMPL_COLL4(func, RING, devredop, type, ncclType) \ - IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \ - IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \ - IMPL_COLL4(func, NVLS, devredop, type, ncclType) \ - IMPL_COLL4(func, NVLS_TREE, devredop, type, ncclType) - -#if NCCL_TYPE == 0 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8) -#elif NCCL_TYPE == 1 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint8_t, ncclUint8) -#elif NCCL_TYPE == 2 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int32_t, ncclInt32) -#elif NCCL_TYPE == 3 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint32_t, ncclUint32) -#elif NCCL_TYPE == 4 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int64_t, ncclInt64) -#elif NCCL_TYPE == 5 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint64_t, ncclUint64) -#elif NCCL_TYPE 
== 6 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, half, ncclFloat16) -#elif NCCL_TYPE == 7 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, float, ncclFloat32) -#elif NCCL_TYPE == 8 -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, double, ncclFloat64) -#elif NCCL_TYPE == 9 && defined(__CUDA_BF16_TYPES_EXIST__) -#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, __nv_bfloat16, ncclBfloat16) -#endif - -// Reduction define all functions -#if NCCL_OP == 0 -#define IMPL_COLL_R(func) IMPL_COLL2(func, Sum); -#elif NCCL_OP == 1 -#define IMPL_COLL_R(func) IMPL_COLL2(func, Prod); -#elif NCCL_OP == 2 -#define IMPL_COLL_R(func) IMPL_COLL2(func, Min); -#elif NCCL_OP == 3 -#define IMPL_COLL_R(func) IMPL_COLL2(func, Max); -#elif NCCL_OP == 4 -#define IMPL_COLL_R(func) IMPL_COLL2(func, PreMulSum); -#elif NCCL_OP == 5 - #if NCCL_TYPE < 6 - #define IMPL_COLL_R(func) IMPL_COLL2(func, SumPostDiv); - #else - #define IMPL_COLL_R(func) // skip SumPostDiv for floating point - #endif -#endif - -#if NCCL_OP == 0 && NCCL_TYPE == 0 -// Copy primitives only define one function for copy -#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8); - -// Point-to-point primitives only have one function/kernel. 
-#define IMPL_COLL_P(func) \ - IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \ - IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P); -#else -#define IMPL_COLL_C(func) -#define IMPL_COLL_P(func) -#endif - -#define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP)) +#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \ + __device__ void ncclDevFunc_##suffix() { \ + RunWork, algo, proto>().run(&ncclShmem.work); \ + } #endif diff --git a/src/collectives/device/common_kernel.h b/src/device/common_kernel.h similarity index 97% rename from src/collectives/device/common_kernel.h rename to src/device/common_kernel.h index 6af8da57ea..bfeb87fdf3 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/device/common_kernel.h @@ -7,7 +7,7 @@ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ -#include "devcomm.h" +#include "device.h" #include "op128.h" #include "reduce_kernel.h" #include @@ -81,13 +81,13 @@ __device__ __forceinline__ void reduceCopyPacks( for (int u=0; u < Unroll; u++) { if (0 < MultimemSrcs) { // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. - acc[u] = applyLoadMultimem(preFn, minSrcs[0]); + acc[u] = applyLoadMultimem(redFn, minSrcs[0]); } else { // Use volatile loads in case credits are polled for with volatile (instead of acquire). acc[u] = ld_volatile_global(minSrcs[0]); + if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]); } minSrcs[0] += WARP_SIZE*BytePerPack; - if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]); } } @@ -99,7 +99,7 @@ __device__ __forceinline__ void reduceCopyPacks( for (int u=0; u < Unroll; u++) { if (s < MultimemSrcs) { // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. - acc[u] = applyLoadMultimem(preFn, minSrcs[s]); + acc[u] = applyLoadMultimem(redFn, minSrcs[s]); } else { // Use volatile loads in case credits are polled for with volatile (instead of acquire). 
tmp[u] = ld_volatile_global(minSrcs[s]); diff --git a/src/device/generate.py b/src/device/generate.py new file mode 100755 index 0000000000..0b053de17e --- /dev/null +++ b/src/device/generate.py @@ -0,0 +1,405 @@ +#!/usr/bin/env python3 +import os +import sys + +# Order of redops, tys, protos, algos must match src/include/device.h +all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"] +all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"] +all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"] +all_protos = ["LL","LL128","SIMPLE"] +all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"] + +################################################################################ +# The first command line argument is the path to the directory to generate and +# populate. + +gensrc = sys.argv[1] + +if os.path.exists(gensrc): + for name in os.listdir(gensrc): + os.remove(os.path.join(gensrc, name)) + #os.truncate(os.path.join(gensrc, name), 0) +else: + os.mkdir(gensrc) + +################################################################################ +# The second command line argument is used as a regex to filter the functions +# which make it into libnccl. This is helpful for reducing the binary when +# developing device code. The regex supports non-space containing globs '*', +# parentheses '(x)', and union 'a|b'. The string representing the function has +# one of the forms: +# +# SendRecv +# (AllGather|Broadcast) <algo> <proto> +# (AllReduce|Reduce|ReduceScatter) <redop> <type> <algo> <proto> +# +# The possible values for redop, type, algo, proto can be found in the all_ +# lists at the top of this file. 
+# +# Since the Makefile forwards this from the ONLY_FUNCS variable, useful command +# line examples are given: +""" +# Only send/recv: +make ONLY_FUNCS="SendRecv" + +# Only non-reductions: +make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv" + +# Only AllReduce sum f32 (but all algos, protos) +make ONLY_FUNCS="AllReduce Sum f32 * *" + +# Only AllReduce minmax i32 NVLS (but all protos) +make ONLY_FUNCS="AllReduce MinMax i32 NVLS *" + +# AllReduce sum RING LL128 +make ONLY_FUNCS="AllReduce Sum f32 RING LL128" +""" + +# Paste all non-None arguments together with `sep`. +def paste(sep, *args): + return sep.join(x for x in args if x is not None) + +func_pattern = sys.argv[2:3] +if func_pattern and func_pattern[0]: + import re + func_pattern = func_pattern[0] + func_pattern = func_pattern.replace("*", "[^ ]*") + func_pattern += "$" + def func_filter(*fn): + return None is not re.match(func_pattern, paste(" ", *fn), flags=re.IGNORECASE) +else: + def func_filter(coll, redop, ty, algo, proto): + return True + +################################################################################ + +algos_of_coll = { + "AllGather": ["RING","NVLS"], + "AllReduce": all_algos, + "Broadcast": ["RING"], + "Reduce": ["RING"], + "ReduceScatter": ["RING","NVLS"], + "SendRecv": [None] +} + +coll_camel_to_lower = { + "AllGather": "all_gather", + "AllReduce": "all_reduce", + "Broadcast": "broadcast", + "Reduce": "reduce", + "ReduceScatter": "reduce_scatter", + "SendRecv": "sendrecv" +} +coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower} + +################################################################################ + +# Returns pair of minimum required values for (CUDART_VERSION, __CUDA_ARCH__) +# or None if function is never supported. Note that (0, 0) encodes universal +# support. 
+def required_cuda(coll, redop, ty, algo, proto): + cudart, arch = 0, 0 + # kernels mapped to by coll="Nop" functions have coll="Generic" + if coll in ("SendRecv", "Generic", "Nop"): return (cudart, arch) + + if proto!="SIMPLE" and algo not in ("RING","TREE"): return None + + if coll in ("AllReduce","Reduce","ReduceScatter"): + if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None + if ty=="bf16": cudart = max(cudart, 11000) + + if "NVLS" in algo: + if coll in ("AllReduce","Reduce","ReduceScatter"): + # Must match ncclNvlsSupported() in src/include/device.h + nvls_ok = ((ty in ("i32","u32","i64","u64") and redop in ("Sum","MinMax")) or + (ty in ("f32","f64") and redop=="Sum") or + (ty in ("f16","bf16") and redop in ("Sum","MinMax"))) + if not nvls_ok: return None + cudart = max(cudart, 12010) + arch = max(arch, 900) + + return (cudart, arch) + +# Maps functions to the chosen representative for the equivalence class it +# belongs to. For instance (sum, signed int) maps to (sum, unsigned int). +def equivalent_primary(coll, redop, ty, algo, proto): + if coll in ("AllReduce", "Reduce", "ReduceScatter"): + # map signed integer sum/prod to unsigned + if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i": + return (coll, redop, "u"+ty[1:], algo, proto) + # map signed integer min/max to unsigned for non-NVLS + if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo): + return (coll, redop, "u"+ty[1:], algo, proto) + return (coll, redop, ty, algo, proto) + +# Map to another func representing the best kernel to use. Every distinct value +# returned will instantiate a ncclDevKernel specialized to run this func +# without function call overhead. +def best_kernel(coll, redop, ty, algo, proto): + def best(coll, redop, ty, algo, proto): + # Modify this logic to control how many kernels are specialized. 
+ if coll=="Nop": return ("Generic", None, None, None, None) + if coll=="SendRecv": return ("SendRecv", None, None, None, None) + if coll in ("AllGather","Broadcast"): return (coll, None, None, "RING", "LL") + return (coll, "Sum", ty, ("TREE" if algo=="TREE" else "RING"), "LL") + # Need to ensure the kernel is specialized for a primary function + kfn = equivalent_primary(*best(coll, redop, ty, algo, proto)) + # And isn't filtered out. + if not func_filter(*kfn): return ("Generic", None, None, None, None) + return kfn + +# The order in which rows are enumerated must match the formula of `ncclDevFuncId()`: +def enumerate_func_rows(): + yield ("SendRecv", None, None, None, None) + for coll in ("AllGather", "Broadcast"): + algos = algos_of_coll[coll] + for algo in algos: + for proto in all_protos: + yield (coll, None, None, algo, proto) + for coll in ("AllReduce", "Reduce", "ReduceScatter"): + algos = algos_of_coll[coll] + for redop in all_redops: + for ty in all_tys: + for algo in algos: + for proto in all_protos: + yield (coll, redop, ty, algo, proto) + +################################################################################ + +def is_built(coll, redop, ty, algo, proto): + built = required_cuda(coll, redop, ty, algo, proto) + built = built and func_filter(coll, redop, ty, algo, proto) + return built + +# Returns None if required_cuda(...) is None. +# Returns the coll="Nop" function if developer has filtered it out. +# Otherwise just returns func it was given. 
+def validate(coll, redop, ty, algo, proto): + valid = required_cuda(coll, redop, ty, algo, proto) + built = valid and func_filter(coll, redop, ty, algo, proto) + if built: return (coll, redop, ty, algo, proto) + if valid: return ("Nop", None, None, None, None) + return None + +# Corresponds to ncclDevFuncRowToId[] +func_rows = [validate(*fn) for fn in enumerate_func_rows()] + +# Corresponds to ncclDevFuncTable[] +primary_funcs = sorted(set(equivalent_primary(*fn) for fn in func_rows if fn is not None)) + +# primary_to_index[primary_funcs[i]] == i +primary_to_index = {fn: i for (i,fn) in zip(range(len(primary_funcs)), primary_funcs)} + +kernel_funcs = sorted(set(best_kernel(*fn) for fn in primary_funcs)) + +################################################################################ + +# Generate /device_table.cu +with open(os.path.join(gensrc, "device_table.cu"), "w") as f: + out = f.write + out('#include "common.h"\n') + out("\n") + + for fn in primary_funcs: + sym = paste("_", "ncclDevFunc", *fn) + cudart, arch = required_cuda(*fn) + if (cudart, arch) != (0, 0): + out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) + out("__device__ void %s();\n" % sym) + if (cudart, arch) != (0, 0): + out("#endif\n") + out("\n") + + out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n"); + index = 0 + for fn in primary_funcs: + sym = paste("_", "ncclDevFunc", *fn) + cudart, arch = required_cuda(*fn) + if (cudart, arch) != (0, 0): + out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch)) + out("/*%4d*/ %s,\n" % (index, sym)) + if (cudart, arch) != (0, 0): + out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index) + index += 1 + out("nullptr};\n") + out("\n") + + out("// Workaround for https://reviews.llvm.org/D55580\n" + "__device__ void ncclWorkaroundClangD55580() {}\n") + +# Generate /host_table.cc +with open(os.path.join(gensrc, "host_table.cc"), "w") as f: + out = f.write + out('#include "device.h"\n') + out("\n") + + # 
The mapping from function rows to valid primary function ids. + out("extern int const ncclDevFuncRowToId[] = {\n") + index = 0 + for fn in func_rows: + fn_id, comment = -1, "" + if fn is not None: + fn_id = primary_to_index[equivalent_primary(*fn)] + comment = " // " + paste(" ", *fn) + out("/*%4d*/ %d,%s\n" % (index, fn_id, comment)) + index += 1 + out("-1};\n") + out("\n") + + # Forward declarations of kernels. + for kfn in kernel_funcs: + cudart, _ = required_cuda(*kfn) + sym = paste("_", "ncclDevKernel", *kfn) + if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) + out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym) + if cudart != 0: out("#endif\n") + out("\n") + + # List of all kernel function pointers. + out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs)) + out("extern void* const ncclDevKernelList[] = {\n") + index = 0 + for kfn in kernel_funcs: + cudart, _ = required_cuda(*kfn) + sym = paste("_", "ncclDevKernel", *kfn) + if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) + out("/*%4d*/ (void*)%s,\n" % (index, sym)); + if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index) + index += 1 + out("nullptr};\n") + out("\n") + + # Maps primary id to kernel function pointer. + out("extern void* const ncclDevKernelForFunc[] = {\n") + index = 0 + for fn in primary_funcs: + kfn = best_kernel(*fn) + sym = paste("_", "ncclDevKernel", *kfn) + cudart, _ = required_cuda(*kfn) + if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) + out("/*%4d*/ (void*)%s,\n" % (index, sym)) + if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index) + index += 1 + out("nullptr};\n") + out("\n") + + # Does the prior map use an explicitly specialized kernel. 
+ out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n") + index = 0 + for fn in primary_funcs: + kfn = best_kernel(*fn) + specialized = "1" if fn == kfn else "0" + out("/*%4d*/ %s,\n" % (index, specialized)) + index += 1 + out("0};\n") + +# Maps to .cu filename which implements this func. The only constraint is that +# "coll" is reflected in the name: formally that no two funcs having different +# coll's map to the same filename. +def impl_filename(coll, redop, ty, algo, proto): + return "%s.cu" % paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty) + +# Partition the functions and kernels to the .cu filenames. The partition is +# a dictionary mapping filename to (coll, func-tuple list) +def partition_by_name(fns): + ans = {} + for fn in fns: + name = impl_filename(*fn) + coll = fn[0] + if name not in ans: + ans[name] = (coll, []) + ans[name][1].append(fn) + return ans + +name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop") +name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic") + +# Generate /rules.mk +with open(os.path.join(gensrc, "rules.mk"), "w") as f: + out = f.write + impl_names = sorted(name_to_funcs.keys()) + names = impl_names + ["host_table.cc", "device_table.cu"] + out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n" + .format(names=" ".join(names))) + out("\n") + + # For each __.cu compile to a .cu.o file. Notice the dependencies + # come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu') + for name in impl_names: + coll = name_to_funcs[name][0] + out( + "$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n" + "\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n" + "\n" + .format(name=name, lower_coll=coll_camel_to_lower[coll]) + ) + +# Add the suffix-erased .cu's which are used only for dependency scraping. 
+for coll in set(coll for (coll,_,_,_,_) in primary_funcs if coll!="Nop"): + name = impl_filename(coll, None, None, None, None) + if name not in name_to_funcs: + name_to_funcs[name] = (coll, []) + +redop_to_cxx = { + None: "FuncCopy", + "Sum": "FuncSum", + "Prod": "FuncProd", + "MinMax": "FuncMinMax", + "PreMulSum": "FuncPreMulSum", + "SumPostDiv": "FuncSumPostDiv" +} + +ty_to_cxx = { + None: "int8_t", + "i8": "int8_t", + "u8": "uint8_t", + "i32": "int32_t", + "u32": "uint32_t", + "i64": "int64_t", + "u64": "uint64_t", + "f16": "half", + "f32": "float", + "f64": "double", + "bf16": "__nv_bfloat16" +} + +# Generate each /.cu: +for name in name_to_funcs.keys(): + (coll, fns) = name_to_funcs[name] + with open(os.path.join(gensrc, name), "w") as f: + out = f.write + out( + '#include "common.h"\n' + '#include "{lower_coll}.h"\n' + .format(lower_coll=coll_camel_to_lower[coll]) + ) + + (_, kfns) = name_to_kernels.get(name) or (None, []) + for kfn in kfns: + (coll, redop, ty, algo, proto) = kfn + sym = paste("_", coll, redop, ty, algo, proto) + fn_id = primary_to_index[kfn] + cudart, arch = required_cuda(*kfn) + if (cudart, arch) != (0, 0): + out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) + out( + "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" + .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], + algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id) + ) + if (cudart, arch) != (0, 0): + out("#endif\n") + + for fn in fns: + (coll, redop, ty, algo, proto) = fn + sym = paste("_", coll, redop, ty, algo, proto) + cudart, arch = required_cuda(*fn) + if (cudart, arch) != (0, 0): + out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) + out( + "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n" + .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], 
ty_cxx=ty_to_cxx[ty], + algo=(algo or "RING"), proto=(proto or "SIMPLE")) + ) + if (cudart, arch) != (0, 0): + out("#endif\n") diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h new file mode 100644 index 0000000000..3bc910047d --- /dev/null +++ b/src/device/network/unpack/unpack.h @@ -0,0 +1,280 @@ +/************************************************************************* + * Copyright (c) 2023, Google LLC. All rights reserved. + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef NET_DEVICE_UNPACK_H +#define NET_DEVICE_UNPACK_H + +#include "unpack_defs.h" + +#include "op128.h" +#include "align.h" +#include "device.h" +#include "common.h" + +// #define ALIGNED_LOAD + +inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) { + #if __CUDA_ARCH__ >= 700 + asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];" + : "=l"(v) : "l"(ptr)); + #else + asm volatile("ld.volatile.global.u64 {%0}, [%1];" + : "=l"(v) : "l"(ptr)); + #endif +} + +#define PAGE_META_SIZE 16 +#define META_LOAD_SIZE 16 +#define DATA_LOAD_SIZE 16 + +// Map internal association of handle with group and peer index (called once at init time) +inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) { + struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; + ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta; + ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf; + ncclShmem.groups[group].devicePlugin.unpack.head = handle->head; +} + +inline __device__ void ncclNetDeviceIncrementHead(const int group) { + ncclShmem.groups[group].devicePlugin.unpack.head++; +} + +inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) { + struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; + 
handle->head = ncclShmem.groups[group].devicePlugin.unpack.head; +} + +template +inline __device__ void bulkLoad(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack *reg, const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ + bulkLoad<1>(t, len, cpy_src, cpy_dst, reg, w, g_meta, s_meta, src_off, dst_off); +} + +template <> +inline __device__ void bulkLoad<1>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<1> reg[16], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ + uint64_t data_s; + for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { + +#ifdef ALIGNED_LOAD + load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]); +#else +#pragma unroll + for (int i=0; i<16; i++) { + reg[i] = ld_volatile_global<1>((uintptr_t)((uint8_t*)(cpy_src + data_s) + i)); + } +#endif + +#pragma unroll + for (int i=0; i<16; i++) { + st_global<1>((uintptr_t)((uint8_t*)(cpy_dst + data_s) + i), reg[i]); + } + } +} + +template <> +inline __device__ void bulkLoad<2>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<2> reg[8], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ + uint64_t data_s; + for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { +#ifdef ALIGNED_LOAD + load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]); +#else +#pragma unroll + for (int i=0; i<8; i++) { + reg[i] = ld_volatile_global<2>((uintptr_t)((uint16_t*)(cpy_src + data_s) + i)); + } +#endif + + +#pragma unroll + for (int i=0; i<8; i++) { + st_global<2>((uintptr_t)((uint16_t*)(cpy_dst + data_s) + i), reg[i]); + } + } +} + +template <> +inline __device__ void bulkLoad<4>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<4> reg[4], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, 
uint64_t dst_off){ + uint64_t data_s; + for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { +#ifdef ALIGNED_LOAD + load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]); +#else +#pragma unroll + for (int i=0; i<4; i++) { + reg[i] = ld_volatile_global<4>((uintptr_t)((uint32_t *)(cpy_src + data_s) + i)); + } +#endif + +#pragma unroll + for (int i=0; i<4; i++) { + st_global<4>((uintptr_t)((uint32_t*)(cpy_dst + data_s) + i), reg[i]); + } + } +} + +template <> +inline __device__ void bulkLoad<8>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<8> reg[2], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ + uint64_t data_s; + for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { +#ifdef ALIGNED_LOAD + load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]); +#else +#pragma unroll + for (int i=0; i<2; i++) { + reg[i] = ld_volatile_global<8>((uintptr_t)((uint64_t*)(cpy_src + data_s) + i)); + } +#endif + +#pragma unroll + for (int i=0; i<2; i++) { + st_global<8>((uintptr_t)((uint64_t*)(cpy_dst + data_s) + i), reg[i]); + } + } +} + +template <> +inline __device__ void bulkLoad<16>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<16> reg[1], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){ + uint64_t data_s; + for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) { + reg[0] = ld_volatile_global<16>((uintptr_t)(cpy_src + data_s)); + st_global<16>((uintptr_t)(cpy_dst + data_s), reg[0]); + } +} + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif +inline __device__ int ppw(const int nbytes, int nw) { + int v = DIVUP(nbytes, SLICE_PAGE_SIZE); + v = DIVUP(v, nw); + while (v > WARP_SHM_PAGE_CNT) { + v = DIVUP(v, 2); + } + return v; +} + +// This function is called by all threads +// 
Pack data from the internal iovec to the supplied flat buffer using all the +// threads +template +inline __device__ void ncclNetDeviceUnpack( + const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize); + +template <> +inline __device__ void ncclNetDeviceUnpack( + const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) { + // send unpack empty +} + +inline __device__ void ncclNetDeviceUnpackInner( + const int tid, const int tidInBlock, const int nworkers, const int group, const int index, + void *src, const int nbytes, const uint64_t step); + +template <> +inline __device__ void ncclNetDeviceUnpack( + const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) { + + while (mask != 0) { + int ix = __ffs(mask)-1; // Get the first set bit of the mask (this should correlate to a peer index) + mask &= mask-1; // Drop the first set bit of the mask + + // Pack data from the internal iovec to the supplied flat srcs buffer using all the threads + // + Src is necessary in the case of accessing the user buffer directly + ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/, + ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head); + } +} + +inline __device__ void ncclNetDeviceUnpackInner( + const int tid, const int tidInBlock, const int nworkers, const int group, const int index, + void *src, const int nbytes, const uint64_t step) { + // from src/collectives/device/common_kernel.h + const int w = tid / WARP_SIZE; // Warp number + const int nw = nworkers / WARP_SIZE; // Number of warps + const int t = tid % WARP_SIZE; // Thread (inside the warp) + + BytePack<16> reg; + loadMeta meta; + + uint64_t head; + struct netUnpackMeta* g_meta_struct; + void* bounce_buf; + + loadMeta* g_meta; + loadMeta* s_meta; + 
uint64_t meta_cnt; + + // hack head use per-warp + head = step; + g_meta_struct = ncclShmem.groups[group].devicePlugin.unpack.g_meta[index]; + bounce_buf = ncclShmem.devicePlugin.unpack.bounce_buf; + + __syncwarp(); + + head %= NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH; + + g_meta = g_meta_struct->mem[head]; + + // Currently, even/odd groups perform send/recv separately. We don't really need space for send side. + // Total size is N page per warp * 16 B per page * 20 WARPS max = 320 * N bytes, N == WARP_SHM_PAGE_CNT + static_assert(ncclShmemScratchWarpSize() >= WARP_SHM_SIZE, "Each warp must have enough scratch space"); + s_meta = (loadMeta*) ncclScratchForWarp(tidInBlock / WARP_SIZE); // (loadMeta*) (ncclShmem.devicePlugin.unpack.meta + shm_off); + + load64gpu(g_meta_struct->cnt + head, meta_cnt); + + int PPW = ppw(nbytes, nw); + + for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) { + + uint64_t iter_meta_cnt = meta_cnt - meta_s; + iter_meta_cnt = iter_meta_cnt < PPW ? iter_meta_cnt : PPW; + + // TODO: this load size needs to work if not aligned, but since the two are both 16... 
+ if (t < PPW * PAGE_META_SIZE / META_LOAD_SIZE && t < iter_meta_cnt) { // avoid last iter load garbage data + load128((const uint64_t*) (g_meta + (meta_s + t)), reg.u64[0], reg.u64[1]); + + storeShmem128(shmemCvtPtr((uint64_t *)(s_meta + (w * PPW + t))), reg.u64[0], reg.u64[1]); + } + + __syncwarp(); + + for (int x = 0; x < iter_meta_cnt; x++) { + int meta_idx = x + w * PPW; + + // load this page's metadata (src_off, len, dst_off) back from shared memory + loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]); + + if (meta.len >= DATA_LOAD_SIZE) { + // fast path, but need to adapt to alignment issue + + // bulk copy data using the widest access both offsets allow + uint8_t align_off = (meta.src_off | meta.dst_off) % DATA_LOAD_SIZE; + align_off = align_off & -align_off; // keep the lowest set bit + if (align_off == 0) { // both offsets are multiples of 16 bytes + bulkLoad<16>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off); + } else if (align_off & 0x8) { + bulkLoad<8>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<8>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off); + } else if (align_off & 0x4) { + bulkLoad<4>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<4>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off); + } else if (align_off & 0x2) { + bulkLoad<2>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<2>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off); + } else { // if (align_off & 0x1) + bulkLoad<1>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<1>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off); + } + } + + // tail of fewer than 16 bytes: each of the first (len % 16) threads copies one byte + if (t < meta.len % DATA_LOAD_SIZE) { + volatile char* cpy_src = (char*) bounce_buf + meta.src_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t; + volatile char* cpy_dst = (char*) src + meta.dst_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t; + *cpy_dst = *cpy_src; + } + } + + 
__syncwarp(); + } +} + +#endif // NET_DEVICE_UNPACK_H diff --git a/src/device/network/unpack/unpack_defs.h b/src/device/network/unpack/unpack_defs.h new file mode 100644 index 0000000000..9be1c5e424 --- /dev/null +++ b/src/device/network/unpack/unpack_defs.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2023, Google LLC. All rights reserved. + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef NET_DEVICE_UNPACK_DEFS_H +#define NET_DEVICE_UNPACK_DEFS_H + +#include <stdint.h> + +#include "device.h" + +#define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16 + +union alignas(16) loadMeta { + uint64_t r64[2]; + struct { + uint32_t src_off; + uint32_t len; + uint64_t dst_off; + }; +}; +static_assert(sizeof(union loadMeta) == 16, "loadMeta must be exactly 16 bytes"); + +/****** global memory ******/ + +#define NET_UNPACK_MAX_QUEUE_DEPTH 16 // MAX_REQUESTS +#define NET_UNPACK_MAX_SLICE_SIZE 4194304 // 4MB per Irecv call +#define SLICE_PAGE_SIZE 4096 +#define NET_UNPACK_MAX_SLICE_PAGES \ + (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2) // * 2 for slack, wasteful.. 
+ +struct netUnpackMeta { + loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES]; + uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH]; +}; + +struct unpackNetDeviceHandle { + struct netUnpackMeta *meta; // mapped + void* bounce_buf; + uint64_t head; +}; + +/****** shared memory ******/ + +#define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h +#define NET_UNPACK_MAX_NPEERS 2 // The most you should have is 2 network peers per-group (indexed by index) +#define WARP_SHM_PAGE_CNT 4 +#define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta)) +struct unpackShmem { + void* bounce_buf; +}; + +struct unpackGroupShmem { + int unpackNetDeviceIndexMask; // Bitmask of peer indices whose recv connection uses device unpack (bit i = peer index i) + uint64_t head; + struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // per-peer metadata queues (indexed by peer index); head modulo the queue depth selects the slot to copy from +}; + +#endif // NET_DEVICE_UNPACK_DEFS_H diff --git a/src/device/onerank.cu b/src/device/onerank.cu new file mode 100644 index 0000000000..5ff4a85b10 --- /dev/null +++ b/src/device/onerank.cu @@ -0,0 +1,79 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "alloc.h" +#include "collectives.h" +#include "common_kernel.h" +#include "common.h" +#include + +namespace { + template + __global__ __launch_bounds__(512, 1) + void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) { + using T = typename RedOp::EltType; + int tid = threadIdx.x; + int tn = blockDim.x; + int bid = blockIdx.x; + int bn = gridDim.x; + + // each block/channel gets a roughly equal segment of 16 byte packs; the per-block share is rounded up (ceiling division) so that the tail elements are not dropped + constexpr int EltPerPack = 16/sizeof(T); + intptr_t i0 = (bid+0)*alignUp((nElts+bn-1)/bn, EltPerPack); + intptr_t i1 = (bid+1)*alignUp((nElts+bn-1)/bn, EltPerPack); + i0 = min(i0, nElts); + i1 = min(i1, nElts); + src = (T*)src + i0; + dst = (T*)dst + i0; + + if (redOpArgIsPtr) { + if (redOpArg%2 != 0) { + redOpArg = *reinterpret_cast(redOpArg); + } else if (redOpArg%4 != 0) { + redOpArg = *reinterpret_cast(redOpArg); + } else if (redOpArg%8 != 0) { + redOpArg = *reinterpret_cast(redOpArg); + } else { + redOpArg = *reinterpret_cast(redOpArg); + } + } + reduceCopy + (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0); + } +} + +ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) { + size_t eltSize = ncclTypeSize(eltType); + if (redOp.op != ncclDevPreMulSum) { + if (dst != src) { + NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream)); + } + return ncclSuccess; + } + + void const* kernel; + switch (eltType) { + case ncclInt8: kernel = (void const*)&oneRankReduce>; break; + case ncclUint8: kernel = (void const*)&oneRankReduce>; break; + case ncclInt32: kernel = (void const*)&oneRankReduce>; break; + case ncclUint32: kernel = (void const*)&oneRankReduce>; break; + case ncclInt64: kernel = (void const*)&oneRankReduce>; break; + case ncclUint64: kernel = (void 
const*)&oneRankReduce>; break; + case ncclFloat16: kernel = (void const*)&oneRankReduce>; break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: kernel = (void const*)&oneRankReduce>; break; + #endif + case ncclFloat32: kernel = (void const*)&oneRankReduce>; break; + case ncclFloat64: kernel = (void const*)&oneRankReduce>; break; + default: return ncclInvalidArgument; + } + dim3 grid = {0, 1, 1}; + grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10)); + dim3 block = {512, 1, 1}; + void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr}; + CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream)); + return ncclSuccess; +} diff --git a/src/collectives/device/op128.h b/src/device/op128.h similarity index 82% rename from src/collectives/device/op128.h rename to src/device/op128.h index 8ce18ef600..b2f8227b05 100644 --- a/src/collectives/device/op128.h +++ b/src/device/op128.h @@ -161,21 +161,25 @@ __device__ __forceinline__ T fromPack(typename BytePackOf::Pack pack) { // Load/store of BytePack using integral addresses. 
template __device__ BytePack ld_global(uintptr_t addr); -template __device__ BytePack ld_volatile_global(uintptr_t addr); template __device__ BytePack ld_shared(uint32_t addr); +template __device__ BytePack ld_volatile_global(uintptr_t addr); template __device__ BytePack ld_volatile_shared(uint32_t addr); +template __device__ BytePack ld_relaxed_gpu_global(uintptr_t addr); template __device__ void st_global(uintptr_t addr, BytePack value); template __device__ void st_shared(uint32_t addr, BytePack value); +template __device__ void st_relaxed_gpu_global(uintptr_t addr, BytePack value); template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; } -template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; } +template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; } +template<> __device__ __forceinline__ BytePack<0> ld_relaxed_gpu_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {} template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {} +template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t addr, BytePack<0> value) {} // Used to define implementations for above prototypes. 
-#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \ +#define DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \ template<> \ __device__ __forceinline__ BytePack ld_##space(addr_cxx_ty addr) { \ data_cxx_ty tmp; \ @@ -197,19 +201,44 @@ template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack< data_cxx_ty tmp = value.native; \ asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \ } + +#if __CUDA_ARCH__ >= 700 + #define PTX_relaxed_gpu "relaxed.gpu" +#else + #define PTX_relaxed_gpu "volatile" +#endif + +#define DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \ + template<> \ + __device__ __forceinline__ BytePack ld_relaxed_gpu_global(uintptr_t addr) { \ + data_cxx_ty tmp; \ + asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \ + BytePack ans; \ + ans.native = tmp; \ + return ans; \ + } \ + template<> \ + __device__ __forceinline__ void st_relaxed_gpu_global(uintptr_t addr, BytePack value) { \ + data_cxx_ty tmp = value.native; \ + asm volatile("st." PTX_relaxed_gpu ".global." #data_ptx_ty " [%0], %1;" :: "l"(addr), #data_reg_ty(tmp) : "memory"); \ + } + +#define DEFINE_ld_st__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \ + DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, global, uintptr_t, l) \ + DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, shared, uint32_t, r) \ + DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) + // Single-byte types use 4-byte registers since there is no 1-byte register // character for asm blocks. 
See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints -DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l) -DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r) -DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l) -DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r) -DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l) -DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r) -DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l) -DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r) -#undef DEFINE_ld_st +DEFINE_ld_st__size(1, uint32_t, b8, r) +DEFINE_ld_st__size(2, uint16_t, b16, h) +DEFINE_ld_st__size(4, uint32_t, b32, r) +DEFINE_ld_st__size(8, uint64_t, b64, l) -#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \ +#undef DEFINE_ld_st__size_space +#undef DEFINE_ld_st__size + +#define DEFINE_ld_st_16__space(space, addr_cxx_ty, addr_reg_ty) \ template<> \ __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \ BytePack<16> ans; \ @@ -226,10 +255,23 @@ DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r) __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \ asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \ } -DEFINE_ld_st_16(global, uintptr_t, l) -DEFINE_ld_st_16(shared, uint32_t, r) +DEFINE_ld_st_16__space(global, uintptr_t, l) +DEFINE_ld_st_16__space(shared, uint32_t, r) -#undef DEFINE_ld_st_16 +#undef DEFINE_ld_st_16__space +template<> +__device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) { + BytePack<16> ans; + asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr)); + return ans; +} +template<> +__device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePack<16> value) { + asm volatile("st." 
PTX_relaxed_gpu ".global.v2.b64 [%0], {%1,%2};" :: "l"(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); +} + +#undef PTX_relaxed_gpu + //////////////////////////////////////////////////////////////////////////////// // Atomic load/store using c++ pointers. @@ -247,6 +289,15 @@ __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) { #endif return ans; } +__device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) { + uint64_t ans; + #if __CUDA_ARCH__ >= 700 + asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + #else + asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); + #endif + return ans; +} __device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 diff --git a/src/collectives/device/primitives.h b/src/device/primitives.h similarity index 100% rename from src/collectives/device/primitives.h rename to src/device/primitives.h diff --git a/src/collectives/device/prims_ll.h b/src/device/prims_ll.h similarity index 99% rename from src/collectives/device/prims_ll.h rename to src/device/prims_ll.h index 5389cc4fae..f341d6fb81 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -323,7 +323,7 @@ class Primitives: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv=0, uint8_t connIndexSend=0 + uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), diff --git a/src/collectives/device/prims_ll128.h b/src/device/prims_ll128.h similarity index 99% rename from src/collectives/device/prims_ll128.h rename to src/device/prims_ll128.h index cd50942bbe..43e01c485d 100644 --- a/src/collectives/device/prims_ll128.h +++ 
b/src/device/prims_ll128.h @@ -364,7 +364,7 @@ public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv=0, uint8_t connIndexSend=0 + uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), diff --git a/src/collectives/device/prims_simple.h b/src/device/prims_simple.h similarity index 79% rename from src/collectives/device/prims_simple.h rename to src/device/prims_simple.h index 19cecf97b8..048052eef1 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -4,6 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ +#include "network/unpack/unpack.h" + template class Primitives< @@ -23,7 +25,11 @@ class Primitives< DirectWrite = 0x200, DirectRead = 0x400, ThreadsSynced = 0x800, - NvlsMinPolling = 0x1000; + NvlsMinPolling = 0x1000, + NetDeviceUnpack = 0x2000, + AnyNetDeviceUnpack = 0x4000, + NvlsDirectRead = 0x8000, + NvlsDirectWrite = 0x10000; const int tid, tidInBlock; const int nthreads; int nworkers; @@ -44,6 +50,8 @@ class Primitives< }; uint64_t *connStepPtr; uint64_t connStepCache; // Cache last seen value of (*connStepPtr) + void* mhandle; + void* netDeviceHandle; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { @@ -141,7 +149,7 @@ class Primitives< if (flags & OffsFifoEnabled) ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T); else if (isSendNotRecv && DirectSend) { - if (flags & DirectWrite) { + if (flags & (DirectWrite | NvlsDirectWrite)) { ptrs[index] = directBuff + dstIx + offset; } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; @@ -149,7 +157,7 @@ class Primitives< ptrs[index] = 
connEltsFifo + (step%NCCL_STEPS)*stepSize; } } else if (!isSendNotRecv && DirectRecv) { - if (flags & DirectRead) { + if (flags & (DirectRead | NvlsDirectRead)) { ptrs[index] = directBuff + srcIx + offset; } else if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer @@ -160,6 +168,9 @@ class Primitives< else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } + if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) { + ncclNetDeviceIncrementHead(group); + } step += StepPerSlice; } } @@ -229,7 +240,16 @@ class Primitives< /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size * to 0 to avoid unnecessary workload. */ int workSize = ncclShmem.aborted ? 0 : sliceSize; - if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { + if (flags & AnyNetDeviceUnpack) { + ncclNetDeviceUnpack(tid, tidInBlock, nworkers, group, ncclShmem.groups[group].devicePlugin.unpack.unpackNetDeviceIndexMask, Src, workSize); + // Sync here to make sure all workers are reading from the updated srcs) + subBarrier(); + } + + if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0] + /* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch", + * so we need to check whether MultimemSrcs and MultimemDsts are 0. */ + && MultimemSrcs == 0 && MultimemDsts == 0) { // We can only have one direct receive. 
Since srcs[0] == dstPtr+offset, skip one copy if (Send) { reduceCopy @@ -286,7 +306,7 @@ class Primitives< // shift: peer offset to avoid all ranks sending to or receiving from same peer template __device__ __forceinline__ void - ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) { + ScatterGatherOp(intptr_t inpIx, intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; int offset = 0; // slice offset @@ -295,7 +315,7 @@ class Primitives< #pragma unroll for (int slice=0; slice= 0 && i >= skip) pOffset += peerElem; void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset; - int realPeerSize = min(realSize, totalElem-pOffset); + ssize_t realPeerSize = min(realSize, totalElem-pOffset); if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize); // Mark for threadfence at the end @@ -322,10 +342,10 @@ class Primitives< } } else if (Recv) { if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset; - int pOffset = index*peerOffset; + ssize_t pOffset = index*peerOffset; if (skip >= 0 && index >= skip) pOffset += peerElem; // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer - waitPeer(outIx, outIx+pOffset, offset, realSize); + waitPeer(outIx+pOffset, outIx+pOffset, offset, realSize); subBarrier(); #pragma unroll for (int j=0; j= 0 && i >= skip) pOffset += peerElem; void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset; - int realPeerSize = min(realSize, totalElem-pOffset); + ssize_t realPeerSize = min(realSize, totalElem-pOffset); if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0; if 
(realPeerSize > 0) reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); } @@ -348,6 +368,13 @@ class Primitives< __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { auto *conn = &peer->recv[connIndex]; + if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { + // handle must be a device ptr + netDeviceHandle = conn->netDeviceHandle.handle; + // Cache the handle + ncclNetDeviceUnpackSetup(netDeviceHandle, group, index); + flags |= NetDeviceUnpack; + } step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostRecv) { @@ -377,6 +404,9 @@ class Primitives< // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0; } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) { + /* NVLS direct */ + flags |= NvlsDirectRead; } } if (flags & OffsFifoEnabled) @@ -393,6 +423,7 @@ class Primitives< step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostSend) { connStepPtr = conn->tail; + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; } if (flags & RoleWaitSend) { ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() @@ -424,6 +455,9 @@ class Primitives< // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->flags & NCCL_DIRECT_WRITE) ? 
DirectWrite : 0; } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) { + /* NVLS direct */ + flags |= NvlsDirectWrite; } } } @@ -434,10 +468,10 @@ class Primitives< __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr + uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, int stepSize_=0 ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), - stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { + stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { // For send operations, we need an extra warp to overlap the threadfence and the copy this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0); @@ -473,6 +507,20 @@ class Primitives< loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e); loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e); + if (barrierAny(flags & NetDeviceUnpack)) { + flags |= AnyNetDeviceUnpack; + // g == 0 is the first ThreadPerSync # of threads of this warp + // g == 0 is also the RoleWaitRecv threads of this group, thus the thread ID will correlate to the peer index + if (g == 0) { + uint32_t mask = __ballot_sync((1U << ThreadPerSync) - 1, (flags & NetDeviceUnpack) ? 1 : 0); + + // We only want to update the shared memory variable with a single thread + if (tid == 0) { + ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + } + } + } + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } @@ -485,8 +533,10 @@ class Primitives< auto *conns = (flags & RolePostSend) ? 
ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns; conns[index]->step = step; } - // Make sure all threads are done writing back conn->step and done using - // ncclShmem.groups[group] + + if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) { + ncclNetDeviceSaveHead(netDeviceHandle, group); + } barrier(); } @@ -497,33 +547,41 @@ class Primitives< } if (flags & RoleOutput) userBuff = (T*)outputBuf; bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite); - bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite); + bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite)); bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) - bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer + bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer int regUsed = e != nullptr ? e->elem.regUsed : 0; if (Direct && recvProvider) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; // Wait for consumer to consume previous value before trampling it. - while (*slot != nullptr && !checkAbort(spins)); - directBuff = (T*)outputBuf; - // Encode pointer by XOR'ing against some address they definitely wouldn't send - // since we want to allow them sending us nullptr while not colliding with - // the empty slot value. - *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); + if (slot) { + while (*slot != nullptr && !checkAbort(spins)); + directBuff = (T*)outputBuf; + // Encode pointer by XOR'ing against some address they definitely wouldn't send + // since we want to allow them sending us nullptr while not colliding with + // the empty slot value. 
+ *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); + } } if (Direct && sendAcceptor) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; void *ptr; - while (true) { + while (slot) { ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } - directBuff = regUsed ? (T*)(e->dnOutputs[index]) : + + if (slot) { + directBuff = regUsed ? (T*)(e->dnOutputs[index]) : reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); - *slot = nullptr; + *slot = nullptr; + } else { + /* slot is NULL, it must be regUsed == 1 */ + directBuff = (T*)e->dnOutputs[index]; + } } if (Direct && sendProvider) { int spins = 0; @@ -531,17 +589,19 @@ class Primitives< volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1; // Wait for consumer to consume previous value before trampling it. - while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); - // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) - // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) - directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; - // Exchange pre-scalers for use in direct pull - *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg; - *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32); - // Encode pointer by XOR'ing against some address they definitely wouldn't send - // since we want to allow them sending us nullptr while not colliding with - // the empty slot value. - *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); + if (slot && argSlot0 && argSlot1) { + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); + // If there is no recv, then we are directly pulling from input buffer (e.g. 
directScatter) + // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) + directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; + // Exchange pre-scalers for use in direct pull + *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg; + *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32); + // Encode pointer by XOR'ing against some address they definitely wouldn't send + // since we want to allow them sending us nullptr while not colliding with + // the empty slot value. + *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); + } } if (Direct && recvAcceptor) { int spins = 0; @@ -549,24 +609,29 @@ class Primitives< volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1; void *ptr; - while (true) { + while (slot) { ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } - directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) : - reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); - if (MaxSend != 0) { // reduce group rather than gather group - // Store scalers for remote inputs - uint64_t arg0, arg1; - while (true) { - arg0 = *argSlot0; - arg1 = *argSlot1; - if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + + if (slot && argSlot0 && argSlot1) { + directBuff = regUsed ? (T*)(MaxSend == 0 ? 
e->upOutputs[index] : e->dnInputs[index]) : + reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); + if (MaxSend != 0) { // reduce group rather than gather group + // Store scalers for remote inputs + uint64_t arg0, arg1; + while (true) { + arg0 = *argSlot0; + arg1 = *argSlot1; + if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + } + ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); } - ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff); + *argSlot0 = 0; *argSlot1 = 0; + *slot = nullptr; + } else { + directBuff = (T*)e->dnInputs[index]; } - *argSlot0 = 0; *argSlot1 = 0; - *slot = nullptr; } } @@ -594,6 +659,9 @@ class Primitives< __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false); } + __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { + genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); + } __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp); @@ -611,6 +679,9 @@ class Primitives< __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) { genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false); } + __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) { + genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false); + } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } @@ -635,20 +706,20 @@ class Primitives< } __device__ __forceinline__ void - scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) { + scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { ScatterGatherOp<0, 0, 
0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void - directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) { + directScatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void - gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) { + gather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp=false) { ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp); } __device__ __forceinline__ void - directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) { + directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) { ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } }; diff --git a/src/collectives/device/reduce.h b/src/device/reduce.h similarity index 99% rename from src/collectives/device/reduce.h rename to src/device/reduce.h index 0927037e93..627d9b119b 100644 --- a/src/collectives/device/reduce.h +++ b/src/device/reduce.h @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "devcomm.h" +#include "device.h" #include "collectives.h" #include "primitives.h" diff --git a/src/collectives/device/reduce_kernel.h b/src/device/reduce_kernel.h similarity index 67% rename from src/collectives/device/reduce_kernel.h rename to src/device/reduce_kernel.h index c1a39cedfe..66e9516cd3 100644 --- a/src/collectives/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -12,6 +12,19 @@ #include #include +template +struct IsFloatingPoint: 
std::false_type {}; +template<> +struct IsFloatingPoint: std::true_type {}; +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; +#endif +template<> +struct IsFloatingPoint: std::true_type {}; +template<> +struct IsFloatingPoint: std::true_type {}; + //////////////////////////////////////////////////////////////////////////////// // The reduction function classes. All classes must: // 1. Expose the `EltType` typedef. @@ -19,16 +32,21 @@ // 3. Have constructor taking `uint64_t opArg`. template -struct FuncNull { using EltType = T; __device__ FuncNull(uint64_t opArg=0) {}; }; +struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; }; template struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; }; template struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; }; template -struct FuncMin { using EltType = T; __device__ FuncMin(uint64_t opArg=0) {}; }; -template -struct FuncMax { using EltType = T; __device__ FuncMax(uint64_t opArg=0) {}; }; - +struct FuncMinMax { + using EltType = T; + BytePack xormask; // only used by integers + bool isMinNotMax; // only used by floats + __device__ FuncMinMax(uint64_t opArg=0) { + xormask.native = opArg; + isMinNotMax = (opArg&1)==0; + } +}; template struct FuncPreMulSum; template struct FuncSumPostDiv; @@ -127,8 +145,8 @@ struct Apply_Reduce { // Base case definitions (EltPerPack == 1) template -struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { +struct Apply_Reduce, /*EltPerPack=*/1> { + __device__ static BytePack reduce(FuncCopy fn, BytePack a, BytePack b) { return a; } }; @@ -145,15 +163,9 @@ struct Apply_Reduce, /*EltPerPack=*/1> { } }; template -struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncMin fn, BytePack a, BytePack b) { - return toPack(min(fromPack(a), fromPack(b))); - } -}; -template -struct 
Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncMax fn, BytePack a, BytePack b) { - return toPack(max(fromPack(a), fromPack(b))); +struct Apply_Reduce, /*EltPerPack=*/1> { + __device__ static BytePack reduce(FuncMinMax fn, BytePack a, BytePack b) { + return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? a : b; } }; @@ -161,57 +173,55 @@ struct Apply_Reduce, /*EltPerPack=*/1> { template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { - constexpr uint32_t lo = 0x00ff00ff; - constexpr uint32_t hi = ~lo; - uint32_t x = a.u32; - uint32_t y = b.u32; - a.u32 = (((x&lo) + (y&lo))&lo) + (((x&hi) + (y&hi))&hi); + constexpr uint32_t even = 0x00ff00ffu; + uint32_t x = (a.native & even) + (b.native & even); + uint32_t y = (a.native & ~even) + (b.native & ~even); + //a.native = (x & even) | (y & ~even); + a.native = __byte_perm(x, y, 0x7250); return a; } }; + template<> -struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { - return Apply_Reduce, 4>::reduce(FuncSum(), a, b); +struct Apply_Reduce, /*EltPerPack=*/4> { + __device__ static BytePack<4> reduce(FuncMinMax fn, BytePack<4> a, BytePack<4> b) { + constexpr uint32_t ones = 0x01010101u; + constexpr uint32_t even = 0x00ff00ffu; // even byte mask + // Replicate xormask to all bytes + uint32_t x = fn.xormask.native * ones; + // Transform inputs by xormask + uint32_t ax = a.native ^ x; + uint32_t bx = b.native ^ x; + // Use 9-bit arithmetic to compute d=a-b + uint32_t d0 = (ax & even) + (~bx & even) + ones; + uint32_t d1 = (ax>>8 & even) + (~(bx>>8) & even) + ones; + // Move sign bit of each 9-bit delta into the least bit of origin byte + //uint32_t s = (d0>>8 & ones & even) | (d1 & ones & ~even); + uint32_t s = __byte_perm(d0, d1, 0x7351) & ones; + // Broadcast least bit across whole byte + s *= 0xffu; + // Compose result by selecting bytes 
via: signbit(a-b)==1 ? a : b + a.native = (a.native & s) | (b.native & ~s); + return a; } }; -#if 300 <= __CUDA_ARCH__ && __CUDA_ARCH__ < 500 - template<> - struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncMin fn, BytePack<4> a, BytePack<4> b) { - uint32_t z=0; - asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); - return a; - } - }; - template<> - struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncMin fn, BytePack<4> a, BytePack<4> b) { - int32_t z=0; - asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); - return a; - } - }; - template<> - struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncMax fn, BytePack<4> a, BytePack<4> b) { - uint32_t z=0; - asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); - return a; - } - }; - template<> - struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncMax fn, BytePack<4> a, BytePack<4> b) { - int32_t z=0; - asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); - return a; - } - }; -#endif +template<> +struct Apply_Reduce, /*EltPerPack=*/4> { + __device__ static BytePack<4> reduce(FuncProd fn, BytePack<4> apack, BytePack<4> bpack) { + uint32_t a = apack.native; + uint32_t b = bpack.native; + uint32_t ab0 = (a*b) & 0xffu; + asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u)); + uint32_t ab1; + asm("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000)); + asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u)); + apack.native = __byte_perm(ab0, ab1, 0x6420); + return apack; + } +}; -#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_x_y) \ +#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_fn_x_y) \ template<> \ struct Apply_Reduce, EltPerPack> { \ __device__ 
__forceinline__ static BytePack reduce( \ @@ -219,10 +229,13 @@ struct Apply_Reduce, /*EltPerPack=*/4> { ) { \ Vec x = fromPack(a); \ Vec y = fromPack(b); \ - return toPack(expr_of_x_y); \ + return toPack(expr_of_fn_x_y); \ } \ }; +SPECIALIZE_REDUCE(FuncMinMax, float, 1, float, fn.isMinNotMax ? fminf(x, y) : fmaxf(x, y)) +SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : fmax(x, y)) + #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y)) SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y)) @@ -234,13 +247,10 @@ struct Apply_Reduce, /*EltPerPack=*/4> { #endif #if __CUDA_ARCH__ >= 800 - SPECIALIZE_REDUCE(FuncMin, half, 1, half, __hmin(x, y)) - SPECIALIZE_REDUCE(FuncMin, half, 2, half2, __hmin2(x, y)) - SPECIALIZE_REDUCE(FuncMax, half, 1, half, __hmax(x, y)) - SPECIALIZE_REDUCE(FuncMax, half, 2, half2, __hmax2(x, y)) + SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y)) + SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y)) #else - SPECIALIZE_REDUCE(FuncMin, half, 1, half, __float2half(fminf(__half2float(x), __half2float(y)))) - SPECIALIZE_REDUCE(FuncMax, half, 1, half, __float2half(fmaxf(__half2float(x), __half2float(y)))) + SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? 
fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y)))) #endif #if defined(__CUDA_BF16_TYPES_EXIST__) @@ -249,15 +259,12 @@ struct Apply_Reduce, /*EltPerPack=*/4> { SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y)) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y)) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y)) - SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __hmin(x, y)) - SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 2, __nv_bfloat162, __hmin2(x, y)) - SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __hmax(x, y)) - SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 2, __nv_bfloat162, __hmax2(x, y)) + SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y)) + SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y)) #else SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y))) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) * __bfloat162float(y))) - SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fminf(__bfloat162float(x), __bfloat162float(y)))) - SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fmaxf(__bfloat162float(x), __bfloat162float(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fn.isMinNotMax ? 
fminf(__bfloat162float(x), __bfloat162float(y)) : fmaxf(__bfloat162float(x), __bfloat162float(y)))) #endif #endif @@ -479,19 +486,6 @@ struct Apply_PreOp, /*EltPerPack=*/1> { //////////////////////////////////////////////////////////////////////////////// // FuncSumPostDiv -template -struct IsFloatingPoint: std::false_type {}; -template<> -struct IsFloatingPoint: std::true_type {}; -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; -#endif -template<> -struct IsFloatingPoint: std::true_type {}; -template<> -struct IsFloatingPoint: std::true_type {}; - template::value> struct FuncSumPostDiv_IntOnly; @@ -543,25 +537,44 @@ struct Apply_PostOp, /*EltPerPack=*/1> { #define SIZEOF_BytePack_field_u64 8 #define PTX_REG_BytePack_field_u64 "l" -#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \ template<> \ - struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ + struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ - __device__ static BytePack load(Fn fn, uintptr_t addr) { \ + __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack ans; \ - asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \ + asm("multimem.ld_reduce.relaxed.sys.global.add." 
#ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ : "l"(addr)); \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \ template<> \ - struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ - static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ - __device__ static BytePack load(Fn fn, uintptr_t addr) { \ + struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ + static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ + __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ BytePack ans; \ - asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + if (fn.isMinNotMax) { \ + asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ + : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + : "l"(addr)); \ + } else { \ + asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \ + : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + : "l"(addr)); \ + } \ + return ans; \ + } \ + }; + +#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ + template<> \ + struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ + static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ + __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ + BytePack ans; \ + asm("multimem.ld_reduce.relaxed.sys.global.add.v4." 
#ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ @@ -570,18 +583,61 @@ struct Apply_PostOp, /*EltPerPack=*/1> { return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \ - DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ template<> \ - struct Apply_LoadMultimem, sizeof(T)> { \ - __device__ static BytePack load(Fn fn, uintptr_t addr) { \ + struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ + static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ + __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + BytePack ans; \ + if (fn.isMinNotMax) { \ + asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ + "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ + "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ + "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + : "l"(addr)); \ + } else { \ + asm("multimem.ld_reduce.relaxed.sys.global.max.v4." 
#ptx_ty " {%0,%1,%2,%3}, [%4];" \ + : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ + "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ + "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ + "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + : "l"(addr)); \ + } \ + return ans; \ + } \ + }; + +#define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) \ + DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ + template<> \ + struct Apply_LoadMultimem, sizeof(T)> { \ + __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ BytePack<2*sizeof(T)> tmp; \ - asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \ + asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ : "l"(addr & -uintptr_t(sizeof(T)))); \ return tmp.half[(addr/sizeof(T))%2]; \ } \ }; +#define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) \ + DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ + template<> \ + struct Apply_LoadMultimem, sizeof(T)> { \ + __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + BytePack<2*sizeof(T)> tmp; \ + if (fn.isMinNotMax) { \ + asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ + : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ + : "l"(addr & -uintptr_t(sizeof(T)))); \ + } else { \ + asm("multimem.ld_reduce.relaxed.sys.global.max." 
#ptx_ty " %0, [%1];" \ + : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ + : "l"(addr & -uintptr_t(sizeof(T)))); \ + } \ + return tmp.half[(addr/sizeof(T))%2]; \ + } \ + }; template struct Apply_LoadMultimem { @@ -598,46 +654,39 @@ struct Apply_LoadMultimem { static constexpr bool IsSum = std::is_same>::value || std::is_same>::value || std::is_same>::value; - static constexpr bool IsMinOrMax = std::is_same>::value || - std::is_same>::value; + static constexpr bool IsMinMax = std::is_same>::value; static constexpr bool IsFloat = IsFloatingPoint::value; static constexpr int BigPackSize = IsFloat && IsSum && sizeof(T) < 8 ? 16 : IsFloat && IsSum ? 8 : - IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 : - !IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) : + IsFloat && IsMinMax && sizeof(T)==2 ? 16 : + !IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) : /*multimem.ld_reduce not supported:*/ 0; }; - DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32) - DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32) - DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32) + DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32) + DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32) - DEFINE_Apply_LoadMultimem(FuncSum, int32_t, add, s32, u32) - DEFINE_Apply_LoadMultimem(FuncMin, int32_t, min, s32, u32) - DEFINE_Apply_LoadMultimem(FuncMax, int32_t, max, s32, u32) + DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32) + DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32) - DEFINE_Apply_LoadMultimem(FuncSum, uint64_t, add, u64, u64) - DEFINE_Apply_LoadMultimem(FuncMin, uint64_t, min, u64, u64) - DEFINE_Apply_LoadMultimem(FuncMax, uint64_t, max, u64, u64) + DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64) + DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64) - DEFINE_Apply_LoadMultimem(FuncSum, int64_t, add, u64, u64) - DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64) - DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64) + 
DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64) + DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64) - DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32) - DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32) + DEFINE_Apply_LoadMultimem_sum(float, f32, u32) + DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32) - DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64) + DEFINE_Apply_LoadMultimem_sum(double, f64, u64) - DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32) - DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32) - DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32) + DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32) + DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32) #if defined(__CUDA_BF16_TYPES_EXIST__) - DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32) - DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32) - DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32) + DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) + DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) #endif #else template diff --git a/src/collectives/device/reduce_scatter.h b/src/device/reduce_scatter.h similarity index 58% rename from src/collectives/device/reduce_scatter.h rename to src/device/reduce_scatter.h index d2026e678c..6660cc0adc 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "devcomm.h" +#include "device.h" #include "collectives.h" #include "primitives.h" @@ -98,33 +98,69 @@ struct RunWorkElementlastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*chunkSize; + const int rank = ncclShmem.comm.rank; + 
const int nranks = ncclShmem.comm.nRanks; - const int nThreadsScatter = 128 + WARP_SIZE; - const int nThreadsReduce = 384; + /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; + * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth + * and the rest are allocated to scatter. */ + const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); + const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); const int tidEndScatter = nThreadsScatter; const int tidEndReduce = tidEndScatter + nThreadsReduce; - using Proto = ProtoSimple<1, 1>; - - if (tid < tidEndScatter) { - // Scatter - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0); + if (!args->regUsed) { + if (tid < tidEndScatter) { + // Scatter + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, + args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.scatter(offset, nvls->nHeads * size, nelem, size, -1, 0); + } + } else if (tid < tidEndReduce) { + // Reduce through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, + args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + 
ssize_t offset = gridOffset + bid * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.recv(offset, nelem); + } } - } else if (tid < tidEndReduce) { - // Reduce through NVLS - Primitives, /*Direct=*/0, Proto, 0> - prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, - args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recv(offset, nelem); + } else { + if (tid < tidEndScatter) { + // Scatter + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, + args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + prims.scatter(0, 0, 0, 0, -1, 0); + } + + /* gather used as sync */ + prims.gather(0, 0, 0, 0, -1, 0); + } else if (tid < tidEndReduce) { + // Reduce through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff, + args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t outOffset = gridOffset + bid * chunkSize; + ssize_t inpOffset = outOffset + rank * size; + int nelem = min(chunkSize, size - outOffset); + prims.directRecvCopy(inpOffset, outOffset, nelem); + } + + /* send for sync */ + prims.send(0, 0); } } } diff --git a/src/collectives/device/sendrecv.h b/src/device/sendrecv.h similarity index 95% rename from src/collectives/device/sendrecv.h rename to src/device/sendrecv.h index 42d9b550db..5401f0542c 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -4,7 +4,7 @@ * See LICENSE.txt for license information 
************************************************************************/ -#include "devcomm.h" +#include "device.h" #include "collectives.h" #include "primitives.h" @@ -26,7 +26,7 @@ struct RunWork { if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; int const peer = args->peer; Primitives, 1, Proto, 1> prims - (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1); + (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T)); size_t offset = 0; do { int nelem = min(size_t(chunkSize), count-offset); @@ -45,7 +45,7 @@ struct RunWork { if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize int const peer = args->peer; Primitives, 1, Proto, 1> prims - (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1); + (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T)); size_t offset = 0; do { int nelem = min(size_t(chunkSize), count-offset); diff --git a/src/enqueue.cc b/src/enqueue.cc index 43d0ba109a..dbb9865bcf 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -11,83 +11,16 @@ #include "bootstrap.h" #include "channel.h" #include "cudawrap.h" +#include "transport.h" #include // std::memcpy #include // PRIx64 -static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t); - -struct ncclKernelMatch { - void* kernelFn; - bool specialized; -}; - -// Only generate inline kernels for LL -#define NCCL_FUNC5(func, algo, devredop, dtype, specialized) \ - /*LL */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), true && specialized}, \ - /*LL128 */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}, \ - /*SIMPLE*/{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized} - -#define NCCL_FUNC4(func, devredop, type, specialized) \ - 
NCCL_FUNC5(func, TREE, devredop, type, specialized), \ - NCCL_FUNC5(func, RING, devredop, type, specialized), \ - NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \ - NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized), \ - NCCL_FUNC5(func, NVLS, devredop, type, specialized), \ - NCCL_FUNC5(func, NVLS_TREE, devredop, type, specialized) - -#ifdef __CUDA_BF16_TYPES_EXIST__ - #define HAVE_BFLOAT16 1 -#else - #define HAVE_BFLOAT16 0 -#endif - -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3(func, devredop, reduction, specialized) \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int8_t, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint8_t, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int32_t, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint32_t, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int64_t, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint64_t, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, half, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, float, int8_t), specialized), \ - NCCL_FUNC4(func, devredop, MACRO_IF(reduction, double, int8_t), specialized) \ - MACRO_IF(HAVE_BFLOAT16, \ - SINGLE_ARG(, NCCL_FUNC4(func, devredop, MACRO_IF(reduction, __nv_bfloat16, int8_t), specialized)), \ - /*nothing*/ \ - ) - -// Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums. 
-#define NCCL_FUNCS2(func, reduction) \ - NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/1), /*Sum*/ \ - NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Prod*/ \ - NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Max*/ \ - NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Min*/ \ - NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*PreMulSum*/ \ - NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0) /*SumPostDiv*/ - -// Must be consistent with the ncclFuncSet enum -static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { - {(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true}, - // We don't bake special kernels for the one-rank reductions - {/*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - {/*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - #if HAVE_BFLOAT16 - {/*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, - #endif - NCCL_FUNCS2(Broadcast, /*reduction=*/0), - NCCL_FUNCS2(Reduce, /*reduction=*/1), - NCCL_FUNCS2(AllGather, /*reduction=*/0), - NCCL_FUNCS2(ReduceScatter, /*reduction=*/1), - NCCL_FUNCS2(AllReduce, /*reduction=*/1) +enum ncclRegBufferType { + NCCL_REGULAR_BUFFER = 0, + NCCL_IPC_REG_BUFFER = 1, + NCCL_NVLS_REG_BUFFER = 2, + NCCL_REG_BUFFER_NUM = 3 }; static ncclResult_t 
computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */); @@ -96,19 +29,14 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { - constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]); ncclResult_t result = ncclSuccess; if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); - // Keep track if we already visited a function pointer. - void* lru[2] = {nullptr, nullptr}; - for (int i=0; i < KernelCount; i++) { - void* fn = ncclKerns[i].kernelFn; - if (fn == lru[0] || fn == lru[1]) goto next_kernel; - lru[1] = lru[0]; - lru[0] = fn; + for (int k=0; k < ncclDevKernelCount; k++) { + void* fn = ncclDevKernelList[k]; + if (fn == nullptr) continue; if (maxStackSize) { cudaFuncAttributes attr = {0}; @@ -116,14 +44,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; ignore0:; } - if (carveout) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributePreferredSharedMemoryCarveout, carveout), result, ignore1); ignore1:; } - if (ncclShmemDynamicSize(cudaArch) != 0) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)), @@ -218,7 +144,7 @@ static void appendWorkElemP2p( struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, struct ncclWorkElemP2p const *elem, bool fuseOk ) { - constexpr int funcIndex = FUNC_INDEX_P2P; + int funcIndex = ncclDevFuncId_P2p(); struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); if (q && funcIndex == q->work.header.funcIndex) { @@ -240,7 +166,7 @@ static void appendWorkElemP2p( } q = ncclMemoryStackAlloc(&comm->memScoped); 
q->work.header.type = ncclWorkTypeP2p; - q->work.header.funcIndex = FUNC_INDEX_P2P; + q->work.header.funcIndex = ncclDevFuncId_P2p(); chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0; chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1; q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment @@ -265,7 +191,7 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP static ncclResult_t addCollToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex, struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp, - int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[] + int nCollChannels, int nBid, size_t bytes, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[] ) { struct ncclKernelPlan::Channel *chans = plan->channels; @@ -307,10 +233,9 @@ static ncclResult_t addCollToPlan( // Add work elem *nWorkBudget += chans[c].nWork; - if (!regBufUsed) { + if (regBufType == NCCL_REGULAR_BUFFER) { appendWorkElemColl(comm, plan, c, funcIndex, workElem, bid); - } else { - // Buffer registration in play which could only for CollNet at the moment. 
+ } else if (regBufType == NCCL_IPC_REG_BUFFER) { struct ncclChannel* channel = &comm->channels[c]; struct ncclWorkElemReg workElemReg; workElemReg.elem = *workElem; // C++ struct assignment @@ -330,6 +255,18 @@ static ncclResult_t addCollToPlan( workElemReg.upOutputs[i] = regBufRecv[j]; } appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid); + } else if (regBufType == NCCL_NVLS_REG_BUFFER) { + struct ncclWorkElemReg workElemReg; + workElemReg.elem = *workElem; // C++ struct assignment + workElemReg.elem.regUsed = 1; + /* NVLS only has one send and recv buffer registered */ + workElemReg.dnInputs[0] = regBufSend[0]; + workElemReg.dnOutputs[0] = regBufRecv[0]; + appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid); + } else { + /* impossible value */ + WARN("Invalid regBufType %d\n", regBufType); + return ncclInvalidArgument; } *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork @@ -417,68 +354,118 @@ static void finishPlan(struct ncclKernelPlan* plan) { plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); } +int64_t ncclParamLocalRegister(); +NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1); + static ncclResult_t registerIntraNodeBuffers( struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info, - bool* outRegBufUsed, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], - void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS] + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], + ncclRegBufferType *outRegBufType ) { - *outRegBufUsed = false; ncclResult_t result = ncclSuccess; + *outRegBufType = NCCL_REGULAR_BUFFER; #if CUDART_VERSION >= 11030 - int localRank = comm->localRank; + if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) { + bool regBufUsed = false; + const void *sendbuff = info->sendbuff; + void *recvbuff = info->recvbuff; + cudaPointerAttributes sattr, rattr; + bool query = false; + + if (info->coll == ncclFuncAllGather) + sendbuff = NULL; + else if (info->coll == 
ncclFuncReduceScatter) + recvbuff = NULL; - if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; + /* first try local registration. */ + if (ncclParamLocalRegister()) { + CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff)); + CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff)); + query = true; + if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice) + ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv); + } - struct HandlePair { - cudaIpcMemHandle_t ipc[2]; // {send, recv} - size_t offset[2]; // {send, recv} - }; - struct HandlePair handles[NCCL_MAX_LOCAL_RANKS]; + if (regBufUsed == false && plan->persistent && ncclParamGraphRegister()) { + if (!query) { + CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff)); + CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff)); + } + if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice) + ncclNvlsGraphRegisterBuffer(comm, plan, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv); + } - CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback); - CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback); + if (regBufUsed) { + /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to + * saturate bandwidth. 
*/ + if (info->coll == ncclFuncReduceScatter) + info->nChannels = std::min(5, comm->nvlsChannels); + else + info->nChannels = std::min(4, comm->nvlsChannels); + *outRegBufType = NCCL_NVLS_REG_BUFFER; + } + } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now + comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other + comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers + plan->persistent && 0) { + /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */ + int localRank = comm->localRank; + cudaPointerAttributes sattr, rattr; - void *baseSend, *baseRecv; - size_t size; - CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff)); - handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; - CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff)); - handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv; + CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff)); + CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff)); + if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess; - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair))); + if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; - // Open handles locally - for (int i=0; i < comm->localRanks; i++) { - if (i == localRank) { // Skip self - outRegBufSend[i] = nullptr; - outRegBufRecv[i] = nullptr; - } else { - for (int sr=0; sr < 2; sr++) { - // Get base address of mapping - void* base; - CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); - // Get real buffer address by adding offset in the mapping - (sr==0 ? 
outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; - // Enqueue reminder to close memory handle - struct ncclPointerList* q = ncclMemoryPoolAlloc(&comm->memPool_ncclPointerList, &comm->memPermanent); - q->ptr = base; - ncclIntruQueueEnqueue(&plan->ipcMemQueue, q); + struct HandlePair { + cudaIpcMemHandle_t ipc[2]; // {send, recv} + size_t offset[2]; // {send, recv} + }; + struct HandlePair handles[NCCL_MAX_LOCAL_RANKS]; + + CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback); + CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback); + + void *baseSend, *baseRecv; + size_t size; + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff)); + handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff)); + handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv; + + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair))); + + // Open handles locally + for (int i=0; i < comm->localRanks; i++) { + if (i == localRank) { // Skip self + outRegBufSend[i] = nullptr; + outRegBufRecv[i] = nullptr; + } else { + for (int sr=0; sr < 2; sr++) { + // Get base address of mapping + void* base; + CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); + // Get real buffer address by adding offset in the mapping + (sr==0 ? 
outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; + // Enqueue reminder to close memory handle + struct ncclPointerList* q = ncclMemoryPoolAlloc(&comm->memPool_ncclPointerList, &comm->memPermanent); + q->ptr = base; + ncclIntruQueueEnqueue(&plan->ipcMemQueue, q); + } } } + *outRegBufType = NCCL_IPC_REG_BUFFER; } - *outRegBufUsed = true; - fallback: #endif return result; } -NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0); - -static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport); -static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps); +static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport); +static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps); static ncclResult_t scheduleCollTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget @@ -517,6 +504,7 @@ static ncclResult_t scheduleCollTasksToPlan( int nAggChannels = 0; int nAggOps = 1; struct ncclTaskColl* aggEnd = head->next; + int nvlsSupport = comm->nvlsSupport && ncclNvlsSupported(aggInfo.opFull.op, aggInfo.datatype); int collNetSupport = 0; NCCLCHECK(getCollNetSupport(&aggInfo, &collNetSupport)); @@ -537,7 +525,7 @@ static ncclResult_t scheduleCollTasksToPlan( NCCLCHECK(ncclInfoSetDerived(&aggInfo, comm->nRanks)); aggInfo.nChannels = std::min(comm->nChannels, nAggChannels); int opPerChannel = DIVUP(nAggChannels, aggInfo.nChannels); - NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, opPerChannel)); + NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, nvlsSupport, opPerChannel)); } while (head != aggEnd) { @@ -566,23 +554,26 @@ static ncclResult_t scheduleCollTasksToPlan( int workFuncIndex; struct ncclWorkElem workElem = {}; struct ncclProxyOp proxyOp = {}; - NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp)); + // Check whether algo and proto have been preset (as in aggregation case) + // If so, skip the 
calculation + if (info.nChannels <= 0 || info.nThreads <= 0) { + NCCLCHECK(getAlgoInfo(&info, collNetSupport, nvlsSupport, 1)); + } if (*nWorkBudget < info.nChannels) return ncclSuccess; // Ensure room for addCollToPlan() - bool regBufUsed = false; + /* if possible, start registration */ + ncclRegBufferType regBufType = NCCL_REGULAR_BUFFER; void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; - if (plan->persistent && ncclParamGraphRegister() && - info.algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now - comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other - comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers - NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv)); - } + + registerIntraNodeBuffers(comm, plan, &info, regBufSend, regBufRecv, ®BufType); + + NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp)); int maxChannels = info.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? 
comm->nvlsChannels : comm->nChannels; NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp, - maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv)); + maxChannels, info.nChannels, info.nBytes, regBufType, regBufSend, regBufRecv)); tasks->nTasksColl -= 1; tasks->collBytesTotal -= info.nBytes; ncclIntruQueueDequeue(&tasks->collQueue); @@ -590,8 +581,8 @@ static ncclResult_t scheduleCollTasksToPlan( plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads); if (!plan->kernelSpecialized) { - plan->kernelFn = ncclKerns[workFuncIndex].kernelFn; - plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized; + plan->kernelFn = ncclDevKernelForFunc[workFuncIndex]; + plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[workFuncIndex]; } } } @@ -619,8 +610,8 @@ static ncclResult_t scheduleP2pTasksToPlan( plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); if (!plan->kernelSpecialized) { - plan->kernelFn = ncclKerns[FUNC_INDEX_P2P].kernelFn; - plan->kernelSpecialized = ncclKerns[FUNC_INDEX_P2P].specialized; + plan->kernelFn = ncclDevKernelForFunc[ncclDevFuncId_P2p()]; + plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[ncclDevFuncId_P2p()]; } // Compute how much to split operations @@ -893,6 +884,13 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); } + /* free mcHandle */ + while (!ncclIntruQueueEmpty(&plan->nvlsMcHandleQueue)) { + struct ncclNvlsMcHandleList* obj = ncclIntruQueueDequeue(&plan->nvlsMcHandleQueue); + NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); + INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); + ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj); + } } ncclMemoryPoolTakeAll(&comm->memPool_ncclProxyOp, 
&plan->memPool_ncclProxyOp); ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); @@ -1142,45 +1140,64 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ -static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) { +static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport) { // Translate ncclAvg and PreMulSum ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; - *collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype]; + *collNetSupport = info->comm->collNetSupport && info->comm->collNetSupportMatrix[netOp][info->datatype]; return ncclSuccess; } // numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency. -static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) { +static ncclResult_t topoGetAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) { struct ncclComm* comm = info->comm; if (comm->nRanks == 1) { info->algorithm = NCCL_ALGO_RING; info->protocol = NCCL_PROTO_SIMPLE; } - else { + else if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. + float backupMinTime = 3600000000.0; + bool backup = false; + int backupAlgo = NCCL_ALGO_UNDEF; // back up algo and proto if no algo/proto is picked up. + int backupProto = NCCL_PROTO_UNDEF; // Find algorithm / protocol. 
info->algorithm = -1; info->protocol = -1; int nAlgos = NCCL_NUM_ALGORITHMS; for (int a=0; adatatype, info->opFull.op)) continue; - if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue; - if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue; + if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue; + if (a == NCCL_ALGO_NVLS && nvlsSupport != 1 && info->coll != ncclFuncAllGather) continue; + if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; + /* now we only support single-node NVLS allgather and reducescatter */ + if (a == NCCL_ALGO_NVLS && (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + if (a == NCCL_ALGO_NVLS_TREE && nvlsSupport != 1) continue; for (int p=0; p= 0 && time < minTime) { - info->algorithm = a; - info->protocol = p; - minTime = time; + NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time, &backup)); + if (!backup) { + if (time >= 0 && time < minTime) { + info->algorithm = a; + info->protocol = p; + minTime = time; + } + } else { + if (time >= 0 && time < backupMinTime) { + backupAlgo = a; + backupProto = p; + backupMinTime = time; + } } } } - if (info->algorithm == -1 || info->protocol == -1) { - WARN("Error : no algorithm/protocol available"); - return ncclInternalError; + + if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { + if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { + WARN("Error : no algorithm/protocol available"); + return ncclInternalError; + } + info->algorithm = backupAlgo; + info->protocol = backupProto; } //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); @@ -1222,6 +1239,25 @@ 
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i return ncclSuccess; } +// Use the default topo-based tuner if tuner plugin is not successful. +// Call the plugin first. Let it set algo+proto, and/or nChannels. +// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto. +// Finally, nChannels will be overriden by the plugin setting. +static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) { + info->algorithm = NCCL_ALGO_UNDEF; + info->protocol = NCCL_PROTO_UNDEF; + int nChannels = 0; + if (info->comm->tuner != NULL) { + NCCLCHECK(info->comm->tuner->getCollInfo( + info->coll, info->nBytes, + collNetSupport, nvlsSupport, numPipeOps, + &info->algorithm, &info->protocol, &nChannels)); + } + NCCLCHECK(topoGetAlgoInfo(info, collNetSupport, nvlsSupport, numPipeOps)); + if (nChannels) info->nChannels = nChannels; // Set by plugin; override default. + return ncclSuccess; +} + static ncclResult_t getPatternInfo(struct ncclInfo* info) { switch (info->coll) { case ncclFuncBroadcast: @@ -1275,14 +1311,6 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { } static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) { - int collNetTypeSupport = 0; - // Check whether algo and proto have been preset (as in aggregation case) - // If so, skip the calculation - if (info->nChannels > 0 && info->nThreads > 0) goto comp_next; - NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport)); - NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1)); - -comp_next: // Set nstepsPerLoop and nchunksPerLoop NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); @@ -1295,14 +1323,7 @@ comp_next: work->nWarps = info->nThreads / WARP_SIZE; work->redOpArg = info->opFull.scalarArg; work->redOpArgIsPtr = info->opFull.scalarArgIsPtr; - - if (info->comm->nRanks == 1) { - 
// one-rank reduce index - *workFuncIndex = 1 + int(info->datatype); - return ncclSuccess; - } - - *workFuncIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); + *workFuncIndex = ncclDevFuncId(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; @@ -1337,6 +1358,7 @@ comp_next: work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_NVLS) { int maxChunkSize = 131072; + if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; // Use uint64_t so that concurrentOps*chunkSize*X does not overflow uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads; @@ -1347,6 +1369,7 @@ comp_next: } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { // Use uint64_t so that concurrentOps*chunkSize*X does not overflow uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads; + if (info->comm->nNodes >= 4) chunkSize = 65536; if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144; if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072; if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; @@ -1381,7 +1404,7 @@ comp_next: proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum - info->op; + info->opFull.proxyOp; proxyOp->pattern = info->pattern; proxyOp->root = info->root; // This is used by P2P to reduce the receive buffer size. 
We don't use it in collectives @@ -1399,27 +1422,37 @@ static ncclResult_t hostToDevRedOp( ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm ) { union { - int8_t i8; - uint8_t u8; - int32_t i32; - uint32_t u32; - int64_t i64; - uint64_t u64; - half f16; + int8_t i8; uint8_t u8; + int32_t i32; uint32_t u32; + int64_t i64; uint64_t u64; + half f16; float f32; double f64; #if defined(__CUDA_BF16_TYPES_EXIST__) __nv_bfloat16 bf16; #endif - float f32; - double f64; void *ptr; }; u64 = 0; opFull->scalarArgIsPtr = false; + opFull->proxyOp = op; + + int nbits = 8*ncclTypeSize(datatype); + uint64_t allBits = uint64_t(-1)>>(64-nbits); + uint64_t signBit = allBits^(allBits>>1); + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; - case ncclMax: opFull->op = ncclDevMax; break; - case ncclMin: opFull->op = ncclDevMin; break; + case ncclMin: + case ncclMax: + opFull->op = ncclDevMinMax; + opFull->scalarArg = 0; + // The xormask used by ncclFuncMinMax<[u]int> is the XOR of the sign bit + // for signed (opposed to unsigned) types and all the bits for max (opposed to min). + if (datatype==ncclInt8 || datatype==ncclInt32 || datatype==ncclInt64) { + opFull->scalarArg ^= signBit; + } + opFull->scalarArg ^= (op == ncclMax) ? 
allBits : 0; + break; case ncclAvg: switch ((int)datatype) { case ncclInt8: case ncclInt32: case ncclInt64: @@ -1513,12 +1546,8 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf struct ncclDevRedOpFull opFull; NCCLCHECK(hostToDevRedOp(&opFull, info->op, info->datatype, comm)); - // User-defined reduction ops may need alter the data even for unitary reductions - if (comm->nRanks == 1 && opFull.op < ncclDevPreMulSum) { - if (info->sendbuff != info->recvbuff) { - size_t bytes = info->count*ncclTypeSize(info->datatype); - CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, bytes, cudaMemcpyDeviceToDevice, info->stream)); - } + if (comm->nRanks == 1) { + NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opFull, info->datatype, info->stream)); return ncclSuccess; } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. diff --git a/src/graph/connect.cc b/src/graph/connect.cc index a71045e628..5af0020eda 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -370,13 +370,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c]; treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c]; treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c]; - nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c]; } for (int r=0; rringPrev[c]; ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c]; } } + for (int c=0; cnChannels; c++) { + for (int n=0; nnvlsHeads[c]; + } + } // Connect rings and trees. This should also duplicate the channels. 
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext)); diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 450ba658f1..42be5919ed 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -70,7 +70,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) { // Find reverse link for (int l=0; lnlinks; l++) { - if (remNode->links[l].remNode == node) { + if (remNode->links[l].remNode == node && remNode->links[l].type == link->type) { remPath->list[0] = remNode->links+l; break; } @@ -126,7 +126,7 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n for (int i=0; ipaths[t][n].count; i++) { struct ncclTopoLink* link = node->paths[t][n].list[i]; struct ncclTopoNode* remNode = link->remNode; - sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); + sprintf(line+offset, "--%s(%g)->%s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], remNode->id); offset = strlen(line); } INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw); @@ -212,14 +212,14 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE if (*level == -1) { int l = -1; if (disableEnv) { - char* str = getenv(disableEnv); + const char* str = ncclGetEnv(disableEnv); if (str) { int disable = strtol(str, NULL, 0); if (disable == 1) l = 0; } } if (l == -1) { - char* str = getenv(levelEnv); + const char* str = ncclGetEnv(levelEnv); if (str) { for (int i=0; i<=PATH_SYS; i++) { if (strcmp(str, topoPathTypeStr[i]) == 0) { @@ -318,14 +318,15 @@ compare: status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite; good &= status == NVML_P2P_STATUS_OK; if (!good) { - if (ncclParamIgnoreDisabledP2p()) { - *p2p = 0; - } else if (path->type <= PATH_NVB) { - WARN("P2P is disabled between NVLINK connected GPUs %d and %d. 
This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); - return ncclUnhandledCudaError; - } else if (path->type < PATH_SYS) { - INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); + if (!ncclParamIgnoreDisabledP2p()) { + if (path->type <= PATH_NVB) { + WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); + return ncclUnhandledCudaError; + } else if (path->type < PATH_SYS) { + INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); + } } + *p2p = 0; } } } @@ -360,7 +361,8 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess; - if (gdrReadParam < 0) { + // Disable GDR Reads pre-Ampere when we have other PCI flows + if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) { int nvlink = 0; // Since we don't know whether there are other communicators, // it's better to keep things local if we have a single GPU. 
@@ -400,7 +402,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int } // Set to 0 to disable the flush on Hopper when using GDR -NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1); +NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0); // Determine whether we need to flush the GDR recv buffers ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) { diff --git a/src/graph/search.cc b/src/graph/search.cc index dd8896bd20..3ebb0d4204 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -49,10 +49,10 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { return ncclSuccess; } -static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) { +static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node2->links+l; - if (link->remNode == node1) { + if (link->remNode == node1 && link->type == type) { *revLink = link; return ncclSuccess; } @@ -85,11 +85,11 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod float fwBw = link->type == LINK_PCI ? 
pciBw : bw; float revBw = 0; if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) { - if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); + if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink)); revBw += fwBw/8; } - if (link->remNode->type == CPU && link->type == LINK_NVL) { - if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); + if (link->remNode->type == CPU && link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER && link->type == LINK_NVL) { + if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink)); revBw += fwBw; } if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; } @@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc } else { for (int i=0; inodes[NVS].count) { + // NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first. 
+ int index = gpu-system->nodes[GPU].nodes; + int i; + int prevGpu = (index-1+ngpus)%ngpus; + int nextGpu = (index+1)%ngpus; + int firstGpus[2]; + int firstGpuCount = 0; + if (graph->pattern == NCCL_TOPO_PATTERN_RING) { + firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2; + } else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE || + graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { + firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2; + } else { + firstGpus[0] = nextGpu; firstGpuCount = 1; + } + for (int g=0; g0; i--) next[i] = next[i-1]; + next[0] = firstGpus[g]; + } + } + } + *countPtr = count; return ncclSuccess; } @@ -267,7 +293,7 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time); // Try to keep all searchs within one second -#define NCCL_SEARCH_GLOBAL_TIMEOUT (5ULL<<16) +#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<19) #define NCCL_SEARCH_TIMEOUT (1<<14) #define NCCL_SEARCH_TIMEOUT_TREE (1<<14) #define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8) @@ -342,6 +368,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better. if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1; + if (graph->nChannels*graph->bwInter > refGraph->nChannels*refGraph->bwInter) *copy = 1; return ncclSuccess; } // 2. Try to get better bandwidth @@ -358,30 +385,27 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop return ncclSuccess; } -// Build a list of the best NETs to try. +// Build a sorted list of the NETs to try. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu // index when trying to get back to the NIC. 
// // The list is built the following way: // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. -// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list -// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which -// might have been choosen by GPU 0 (case with multiple independent communicators per node) -// 3. Then add the NETs to the final list if they were not already added by another closer GPU. +// 2. add other NETs satisfying typeInter but not already in the list. ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { int netCount = 0; int localNetCount; int* localNets; - NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count)); + NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS)); // First add the preferred NICs for (int g=0; gnodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; localNetCount = 0; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - for (int c = 0;; c++) { + for (int c = 0; cgpu.rank, c, &netId)); NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); @@ -451,11 +475,11 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo int startNetIndex; NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; - int netcount; + int netCount; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); - NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount)); - for (int i=0; itypeInter, g, nets, &netCount)); + for (int i=0; inodes[NET].nodes+n; if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric @@ -523,12 +547,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo const int bw = graph->bwInter; int* nets; 
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); - int netcount; - NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount)); - for (int i=0; itypeInter, -1, nets, &netCount)); + for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue; + int n = nets[(graph->nChannels+i)%netCount]; struct ncclTopoNode* net = system->nodes[NET].nodes+n; - struct ncclTopoNode* gpu; if (graph->collNet && net->net.collSupport == 0) continue; if (net->net.bw < bw) continue; @@ -542,12 +566,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } - // NVLS needs to balance on all NICs if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { - if (graph->nChannels < netcount) { + // NVLS search only tries to find NIC:GPU combinations to compute the heads. + if (graph->nChannels < netCount) { int gpu; - NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu)); - if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu)); + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); + if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); } } else { if (graph->nChannels > 0) { @@ -557,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { - if (graph->nChannels == 0) { + if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); @@ -577,18 +601,10 @@ ncclResult_t ncclTopoSearchRecNet(struct 
ncclTopoSystem* system, struct ncclTopo } } if (maxBw >= bw) { - // In the first loop, avoid using GPUs in both directions between channels (one channel - // sending from that GPU and one channel receiving to that GPU), since that usually leads - // to lower BW. - for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { - for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].bw == maxBw && paths[g].count == minHops) { - gpu = system->nodes[GPU].nodes+g; - int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1; - if (tryGpuBidir == gpuUsed) { - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); - } - } + for (int i=0; inodes[GPU].count; i++) { + int g = (graph->nChannels+i)%system->nodes[GPU].count; + if (paths[g].bw == maxBw && paths[g].count == minHops) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } @@ -804,33 +820,50 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs return ncclSuccess; } +ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) { + if (graph->nChannels == 0) return ncclSuccess; + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess; + if (graph->bwIntra < 25.0) return ncclSuccess; + if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess; + + int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); + memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); + memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); + graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); + graph->bwInter /= DIVUP(dupChannels, graph->nChannels); + graph->nChannels = dupChannels; + return ncclSuccess; +} + float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 }; float speedArrayInter[] = { 48.0, 30.0, 
28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) -float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 }; -float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 }; +float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; - graph->crossNic = ncclParamCrossNic(); - int crossNic = (system->nodes[NET].count > 1) && graph->crossNic && + int crossNic = (system->nodes[NET].count > 1) && (graph->pattern == NCCL_TOPO_PATTERN_RING || - graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || - graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0; + graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || + graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0; + graph->crossNic = crossNic == 1 ? 1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; - if (graph->crossNic == 2) graph->crossNic = 0; graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; graph->typeInter = PATH_PIX; graph->nChannels = 0; int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 
0 : 1; graph->sameChannels = trySameChannels; - char* str = getenv("NCCL_GRAPH_FILE"); + int cpuArch, cpuVendor, cpuModel; + NCCLCHECK(ncclTopoCpuType(system, &cpuArch, &cpuVendor, &cpuModel)); + + const char* str = ncclGetEnv("NCCL_GRAPH_FILE"); if (str) { INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str); struct ncclXml* xml; @@ -846,6 +879,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int ccMin; NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; + // NVLS search must have ngpus heads at most. + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; @@ -884,7 +919,7 @@ search: NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time)); #if 0 - printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : ""); + printf("Id %d Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.id, tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : ""); for (int c=0; cnChannels; c++) { printf("%2d : ", c); for (int g=0; gnChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) { @@ -954,6 +990,7 @@ done: // We have a solution. 
Start from that solution and move to pass 2. if (pass == 1) { time = -1; + NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus)); memcpy(&tmpGraph, graph, sizeof(tmpGraph)); speedIndex = 0; while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++; @@ -962,13 +999,22 @@ done: pass = 2; } - // 3. See if we can increase bwIntra for trees (2 nodes or collnet) if (pass == 2) { - if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && - tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 && - speedIndex > 0) { - tmpGraph.bwIntra = speedArray[--speedIndex]; - goto search; + // See if we can increase bw + if (time != 0 && speedIndex > 0) { + if (graph->pattern == NCCL_TOPO_PATTERN_RING) { + // increase bw for Ring + tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex]; + goto search; + } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) { + tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels; + tmpGraph.bwInter = speedArray[--speedIndex]; + goto search; + } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) { + // increase bwIntra for trees (2 nodes or collnet) + tmpGraph.bwIntra = speedArray[--speedIndex]; + goto search; + } } time = -1; memcpy(&tmpGraph, graph, sizeof(tmpGraph)); @@ -982,18 +1028,6 @@ done: graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } - - if (graph->nChannels == 0) return ncclSuccess; - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess; - if (graph->bwIntra < 25.0) return ncclSuccess; - if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess; - - int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); - memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); - memcpy(graph->inter+graph->nChannels*2,graph->inter, 
(dupChannels-graph->nChannels)*2*sizeof(int)); - graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); - graph->bwInter /= DIVUP(dupChannels, graph->nChannels); - graph->nChannels = dupChannels; return ncclSuccess; } @@ -1023,7 +1057,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr } ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) { - char* str = getenv("NCCL_GRAPH_DUMP_FILE"); + const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE"); if (str) { INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str); struct ncclXml* xml; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index cdcc0664f7..481def486b 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -72,6 +72,9 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; } + if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { + *bw = AMD_BW; + } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? 
YONGFENG_ZPI_BW : ZPI_BW; } @@ -540,6 +543,36 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* return ncclSuccess; } +ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) { + if (strcmp(node->name, "c2c") == 0) { + struct ncclTopoNode* gpu = NULL; + int64_t pBusId; + NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); + NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); + if (gpu == NULL) { + WARN("Add NVLink error : could not find GPU %lx", pBusId); + return ncclInternalError; + } + int count = 0; + NCCLCHECK(xmlGetAttrInt(node, "count", &count)); + int bw = 0; + NCCLCHECK(xmlGetAttrInt(node, "bw", &bw)); + double c2cBw = (bw*count)/1000.0; + struct ncclTopoNode* cpu = NULL; + NCCLCHECK(findLocalCpu(gpu, &cpu)); + if (cpu == NULL) return ncclSuccess; + NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw)); + NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw)); + } else { + const char* busId; + NCCLCHECK(xmlGetAttr(node, "busid", &busId)); + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? 
busId : parentBusId)); + } + } + return ncclSuccess; +} + ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) { NCCLCHECK(ncclCalloc(topoSystem, 1)); struct ncclXmlNode* topNode; @@ -549,6 +582,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL)); + NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL)); NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem)); NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); @@ -595,7 +629,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { struct ncclXml* xml; NCCLCHECK(ncclCalloc(&xml, 1)); - char* xmlTopoFile = getenv("NCCL_TOPO_FILE"); + const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE"); if (xmlTopoFile) { INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1)); @@ -668,7 +702,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECK(ncclTopoTrimXml(xml)); - xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE"); + xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml)); @@ -704,7 +738,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch int* localNets; int localNetCount; NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); - int* localGpus; + int* localGpus = NULL; int localGpuCount; NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, 
&localGpuCount, NULL)); int net = system->nodes[GPU].nodes[gpu].gpu.dev; @@ -717,17 +751,25 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) { + int netIndex; + NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex)); + int* localGpus = NULL; + int localGpuCount; + NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL)); for (int c=0; cnodes[GPU].count; g++) { + for (int lg=0; lgnodes[GPU].nodes+g; int id; NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id)); if (net == id) { *gpuIndex = g; + free(localGpus); return ncclSuccess; } } } + free(localGpus); *gpuIndex = -1; return ncclSuccess; } @@ -836,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* if (ccMax) *ccMax = max; return ncclSuccess; } - -ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) { - for (int g=0; gnodes[GPU].count; g++) { - if (system->nodes[GPU].nodes[g].gpu.rank == rank) { - *localRank = g; - return ncclSuccess; - } - } - WARN("Could not find local GPU with rank %d", rank); - return ncclInternalError; -} diff --git a/src/graph/topo.h b/src/graph/topo.h index 8951505fd6..b067f2f975 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,6 +18,7 @@ #define SM86_NVLINK_BW 12.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 +#define AMD_BW 16.0 #define SKL_QPI_BW 10.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index a43ea2628c..a97ed9a1ad 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -5,7 +5,7 @@ ************************************************************************/ #include "core.h" -#include "devcomm.h" +#include "device.h" #include "comm.h" #include "topo.h" @@ -54,9 +54,9 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li // 
Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { - { 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring - { 6.8, 14.0, 0 }, { 6.8, 14.0, 0 }, // Collnet Direct, Chain - { 0, 0, 23.0 }, { 0, 0, 23.0 }}; // NVLS, NVLS Tree + { 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring + { 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain + { 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree // NVLink, PCI, Network #define NCCL_HW_NVLINK 0 @@ -64,17 +64,17 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { #define NCCL_HW_NET 2 static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 }, - /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 4.75 }, - /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } }, + { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 }, + /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, + /* NVLS */ { 0, 0, 23 }, /* NVLSTree */ { 0, 0, 23 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 6 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, - /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 }, + { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, + /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 }, - /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 14 }, - /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 19 } } + { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 }, 
+ /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 }, + /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } } }; /* Array indexes used below */ @@ -165,13 +165,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int a=0; abwIntra : graphs[a]->bwInter; + if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); + if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2); float busBw = graphs[a]->nChannels * bw; // Various model refinements @@ -194,10 +196,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Convert bus BW to algorithm BW float ratio; if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps; - else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0; - else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1)); + else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio = 5.0/6.0; else ratio = .5; comm->bandwidths[coll][a][p] = busBw * ratio; + /* Ring bandwidth backup */ + if (a == NCCL_ALGO_RING) + comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p]; comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; @@ -229,13 +233,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); } else if (a == NCCL_ALGO_COLLNET_DIRECT) { comm->latencies[coll][a][p] += - 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency + 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency } else if (a == NCCL_ALGO_COLLNET_CHAIN) { comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat; } else if (a == NCCL_ALGO_NVLS) { - if (nNodes > 1) comm->latencies[coll][a][p] += 
hwLat[NCCL_HW_NET][a][p]; + comm->latencies[coll][a][p] = intraLat; + if (nNodes > 1) comm->latencies[coll][a][p] += interLat; } else if (a == NCCL_ALGO_NVLS_TREE) { - comm->latencies[coll][a][p] += 2*(nNodes-1)*hwLat[NCCL_HW_NET][a][p]; + comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat; } } } @@ -246,12 +251,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 }; - const char *protoStr = getenv("NCCL_PROTO"); + const char *protoStr = ncclGetEnv("NCCL_PROTO"); if (protoStr) { INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr); NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); } - const char *algoStr = getenv("NCCL_ALGO"); + const char *algoStr = ncclGetEnv("NCCL_ALGO"); if (algoStr) { INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr); NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); @@ -293,11 +298,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } } if (pEnable == 0) comm->bandwidths[c][a][p] = 0; - // Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE. 
- if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue; if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; } + for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) { + bool available = false; + for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) + for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) + if (comm->bandwidths[c][a][p] != 0) { + available = true; + goto check_avail; + } + check_avail: + if (available == false) { + /* at least set ring algo available */ + for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) + comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p]; + } + } + if (comm->rank == 0) { char line[1024]; for (int block=0; block<2; block++) { @@ -346,7 +365,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512; // Override defaults with user env - char* str = getenv("NCCL_THREAD_THRESHOLDS"); + const char* str = ncclGetEnv("NCCL_THREAD_THRESHOLDS"); if (str) { INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }}; @@ -378,9 +397,19 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) { - float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; +ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) { + float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; float lat = info->comm->latencies[info->coll][algorithm][protocol]; + + if (backup) { + *backup = false; + if (algorithm == NCCL_ALGO_RING && bw == 0.0f) { + /* try back up RING algorithm */ + bw = info->comm->ringbdw[info->coll][protocol]; + *backup = true; + } + } + if (bw == 0) { *time = -1.0; return 
ncclSuccess; } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index ac862a4e06..47fda1f851 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -254,9 +254,13 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX return ncclSuccess; } +ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { - struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } }; - NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink }, { "c2c", ncclTopoXmlLoadC2c } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); return ncclSuccess; } @@ -687,6 +691,41 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm } } } +#if CUDART_VERSION >= 11080 + struct ncclXmlNode* c2cNode = NULL; + NCCLCHECK(xmlGetSub(gpuNode, "c2c", &c2cNode)); + if (c2cNode == NULL) { + if (sm >= 90) { + int c2cLinksCount = 0; + nvmlFieldValue_t fv; + fv.fieldId = NVML_FI_DEV_C2C_LINK_COUNT; + if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS)) { + c2cLinksCount = fv.value.uiVal; + int bw = 0; + int count = 0; + for (int l=0; l 0) { + NCCLCHECK(xmlAddNode(xml, gpuNode, "c2c", &c2cNode)); + NCCLCHECK(xmlSetAttrInt(c2cNode, "bw", bw)); + NCCLCHECK(xmlSetAttrInt(c2cNode, "count", count)); + } + } + } + } +#endif // Fill target classes for (int s=0; snSubs; s++) { struct ncclXmlNode* sub = gpuNode->subs[s]; diff --git a/src/group.cc b/src/group.cc index a889c060cb..29400d6bcb 100644 --- a/src/group.cc +++ b/src/group.cc @@ -22,7 +22,6 @@ __thread int ncclGroupBlocking = -1; /* default mode */ __thread bool ncclGroupJobAbortFlag = false; void* ncclAsyncJobMain(void* arg); -static ncclResult_t groupJobComplete(struct 
ncclGroupJob *job); ncclResult_t ncclAsyncLaunch( struct ncclAsyncJob* job, @@ -181,9 +180,28 @@ failure: return result; } -static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) { +static inline void groupResetJobState(struct ncclGroupJob* job) { + if (job) { + if (job->groupBlockingPtr) *job->groupBlockingPtr = -1; + if (job->abortFlagPtr) *job->abortFlagPtr = false; + if (job->groupErrorPtr) *job->groupErrorPtr = ncclSuccess; + if (job->groupCommHeadPtr) *job->groupCommHeadPtr = NULL; + if (job->groupCommPreconnectHeadPtr) *job->groupCommPreconnectHeadPtr = NULL; + memset(job, 0, sizeof(struct ncclGroupJob)); + } + return; +} + +static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) { struct ncclComm* comm = *groupCommHeadPtr; + /* reset all thread local variables */ + *groupCommHeadPtr = NULL; + *groupCommPreconnectHeadPtr = NULL; + *groupErrorPtr = ncclSuccess; + *groupBlockingPtr = -1; + *groupJobAbortFlagPtr = false; + while (comm != nullptr) { struct ncclComm* next = comm->groupNext; (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext @@ -233,16 +251,12 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g /* reset everything */ while (!ncclIntruQueueEmpty(asyncJobsPtr)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); - *job->abortFlag = 1; if (job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, error); if (job->undo) job->undo(job); if (job->destructor) job->destructor((void*)job); } - *groupErrorPtr = ncclSuccess; - *groupCommHeadPtr = nullptr; - *groupCommPreconnectHeadPtr = nullptr; return; } @@ -325,9 +339,6 @@ static ncclResult_t 
groupLaunch(struct ncclAsyncJob *job_) { NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); } - /* this atomic must happen before cleanup and setting state of communicators */ - __atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE); - while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); if (job->comm && !job->comm->config.blocking) @@ -345,16 +356,12 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { groupCommHeadMain = next; } - *gjob->groupErrorPtr = ncclSuccess; - *gjob->groupCommHeadPtr = nullptr; - *gjob->groupCommPreconnectHeadPtr = nullptr; - CUDACHECK(cudaSetDevice(savedDev)); exit: return ret; fail: - groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret); + groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret); goto exit; } @@ -377,7 +384,8 @@ ncclResult_t ncclGroupEndInternal() { ncclGroupJobMain.groupErrorPtr = &ncclGroupError; ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs; ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag; - ncclGroupJobMain.doneFlag = false; + ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking; + ncclGroupJobMain.initialized = true; ncclGroupJobMainPtr = &ncclGroupJobMain; /* make sure ncclGroupBlocking has been set. */ assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); @@ -387,6 +395,7 @@ ncclResult_t ncclGroupEndInternal() { ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); do { NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail); + job->comm->groupJob = ncclGroupJobMainPtr; job = job->next; } while (job); } @@ -395,30 +404,42 @@ ncclResult_t ncclGroupEndInternal() { ncclComm_t comm = ncclGroupCommHead; do { NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); + /* link group job to communicators. 
*/ + comm->groupJob = ncclGroupJobMainPtr; comm = comm->groupNext; } while (comm); } + ncclGroupJobMainPtr->base.func = groupLaunch; SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); ret = ncclInProgress; } else { /* blocking group */ NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail); - groupResetJobState(); + groupResetJobState(ncclGroupJobMainPtr); } } exit: return ret; fail: - groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret); - groupResetJobState(); + groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret); goto exit; } -void ncclGroupJobAbort() { - ncclGroupJobAbortFlag = true; - (void) groupJobComplete(ncclGroupJobMainPtr); - /* reset group abort flag */ - ncclGroupJobAbortFlag = false; +ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) { + ncclResult_t ret = ncclSuccess; + if (groupJob && groupJob->initialized) { + ret = ncclAsyncJobComplete(&groupJob->base); + groupResetJobState(groupJob); + } + return ret; +} + +ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) { + if (groupJob && groupJob->initialized) { + *groupJob->abortFlagPtr = true; + NCCLCHECK(ncclGroupJobComplete(groupJob)); + } + return ncclSuccess; } diff --git a/src/include/alloc.h b/src/include/alloc.h index caa9da9855..f8d954469e 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -101,7 +101,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand /* Allocate the physical memory on the device */ CUCHECK(cuMemCreate(&handle, size, &prop, 0)); /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); /* Map the virtual address range to the physical allocation */ 
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ diff --git a/src/include/collectives.h b/src/include/collectives.h index b9100a22a0..0f965276a4 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -7,108 +7,7 @@ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ -enum ncclDevRedOp_t { - ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin, - ncclDevPreMulSum, ncclDevSumPostDiv, - ncclNumDevRedOps -}; -struct ncclDevRedOpFull { - ncclDevRedOp_t op; - bool scalarArgIsPtr; - uint64_t scalarArg; -}; - -#define FUNC_INDEX_P2P 0 -#define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) - -#define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \ - ncclFunction_##func##_##algo##_##proto##_##devredop##_##type - -#define NCCL_ONERANK_REDUCE_NAME(devredop, type) \ - ncclFunction_OneRankReduce_##devredop##_##type - -#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \ - ncclKernel_##func##_##algo##_##proto##_##devredop##_##type - -#define NCCL_IMPL_NAME(func, algo, proto) \ - nccl##func##algo##proto - -/* Declare all collective operations */ -#define DECL5(func, algo, proto, devredop, type) \ - extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ - extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ - -#define SINGLE_ARG(...) 
__VA_ARGS__ -#define CONCAT(a,b) a##b -#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f)) -#define MACRO_IF_0(t, f) f -#define MACRO_IF_1(t, f) t - -#define DECL4(func, algo, devredop, type, undef) \ - MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \ - MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \ - MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type)) - -#define DECL3(func, devredop, type, undef) \ - DECL4(func, RING, devredop, type, undef) \ - DECL4(func, TREE, devredop, type, undef) \ - DECL4(func, COLLNET_DIRECT, devredop, type, undef) \ - DECL4(func, COLLNET_CHAIN, devredop, type, undef) \ - DECL4(func, NVLS, devredop, type, undef) \ - DECL4(func, NVLS_TREE, devredop, type, undef) - -#if defined(__CUDA_BF16_TYPES_EXIST__) -#define DECL2(func, devredop, undefForFloat) \ - DECL3(func, devredop, int8_t, /*undef=*/0) \ - DECL3(func, devredop, uint8_t, /*undef=*/0) \ - DECL3(func, devredop, int32_t, /*undef=*/0) \ - DECL3(func, devredop, uint32_t, /*undef=*/0) \ - DECL3(func, devredop, int64_t, /*undef=*/0) \ - DECL3(func, devredop, uint64_t, /*undef=*/0) \ - DECL3(func, devredop, half, /*undef=*/undefForFloat) \ - DECL3(func, devredop, float, /*undef=*/undefForFloat) \ - DECL3(func, devredop, double, /*undef=*/undefForFloat) \ - DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat) -#else -#define DECL2(func, devredop, undefForFloat) \ - DECL3(func, devredop, int8_t, /*undef=*/0) \ - DECL3(func, devredop, uint8_t, /*undef=*/0) \ - DECL3(func, devredop, int32_t, /*undef=*/0) \ - DECL3(func, devredop, uint32_t, /*undef=*/0) \ - DECL3(func, devredop, int64_t, /*undef=*/0) \ - DECL3(func, devredop, uint64_t, /*undef=*/0) \ - DECL3(func, devredop, half, /*undef=*/undefForFloat) \ - DECL3(func, devredop, float, /*undef=*/undefForFloat) \ - DECL3(func, devredop, double, /*undef=*/undefForFloat) -#endif - -#define DECL(func) \ - DECL2(func, Sum, 
/*undefForFloat=*/0) \ - DECL2(func, Prod, /*undefForFloat=*/0) \ - DECL2(func, Min, /*undefForFloat=*/0) \ - DECL2(func, Max, /*undefForFloat=*/0) \ - DECL2(func, PreMulSum, /*undefForFloat=*/0) \ - DECL2(func, SumPostDiv, /*undefForFloat=*/1) - -DECL2(Broadcast, Sum, /*undefForFloat=*/0) -DECL(Reduce) -DECL2(AllGather, Sum, /*undefForFloat=*/0) -DECL(ReduceScatter) -DECL(AllReduce) -DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) - -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(); -#if defined(__CUDA_BF16_TYPES_EXIST__) -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)(); -#endif -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(); -extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); +#include "nccl.h" // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) @@ -123,13 +22,27 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above -// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this -// macro will be used in preprocessor conditionals where enums have no meaning. 
-#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \ - (((type==2 || type==3) && (red==0 || red==2 || red==3)) || \ - ((type==4 || type==5) && (red==0 || red==2 || red==3)) || \ - ((type==6 || type==9) && (red==0 || red==2 || red==3)) || \ - (type==7 && red==0) || \ - (type==8 && red==0)) +inline int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + #endif + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} #endif diff --git a/src/include/comm.h b/src/include/comm.h index 8986f9349c..bc5a9c5683 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -10,8 +10,10 @@ #include "transport.h" #include "p2p.h" #include "collectives.h" +#include "nccl_tuner.h" #include "proxy.h" #include "strongstream.h" +#include "nccl_net.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -125,7 +127,7 @@ struct ncclChannel { struct ncclChannelPeer** peers; struct ncclDevChannelPeer** devPeers; /* devPeer pointer array used for host side access */ - struct ncclDevChannelPeer** devPeersHostPtr; + struct ncclDevChannelPeer** devPeersHostPtr; struct ncclRing ring; int* devRingUserRanks; struct ncclTree tree; @@ -155,6 +157,14 @@ struct ncclPointerList { void *ptr; }; +struct ncclNvlsMcHandleList { + struct ncclNvlsMcHandleList *next; + CUmemGenericAllocationHandle mcHandle; + CUdeviceptr ptr; + int dev; + size_t size; +}; + struct ncclKernelPlan { // A kernel plan is also a callback that reclaims itself. Hence this must // be the first member. 
@@ -178,6 +188,7 @@ struct ncclKernelPlan { int collOpCount; // zero based for this plan struct ncclIntruQueue ipcMemQueue; + struct ncclIntruQueue nvlsMcHandleQueue; struct Channel { int nWork; @@ -191,6 +202,23 @@ struct ncclKernelPlan { } channels[MAXCHANNELS]; }; +struct ncclRegRequest { + uintptr_t buff; + size_t size; + struct ncclRegRequest *next; +}; + +struct ncclRegRecord { + uintptr_t buff; + size_t size; + CUdeviceptr regAddr; + size_t regSize; + int dev; + CUmemGenericAllocationHandle mcHandle; + uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */ + struct ncclRegRecord *next; +}; + struct ncclComm { struct ncclMemoryStack memPermanent, memScoped; // List of destructors to run when comm is destructed @@ -261,6 +289,7 @@ struct ncclComm { ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; /* This attribute can indicate the states of communicators and return code of @@ -270,7 +299,7 @@ struct ncclComm { // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; volatile uint32_t *childAbortFlag; - uint32_t *abortFlagRefCount; + volatile uint32_t *abortFlagRefCount; // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm @@ -310,15 +339,19 @@ struct ncclComm { // NVLink SHARP (NVLS) support int nvlsSupport; + int nvlsRegSupport; /* sharable NVLS resource. 
*/ struct ncclNvlsSharedRes* nvlsResources; + struct ncclShmemCollBuff nvlsShmem; + void *nvlsShmemHandle; - size_t channelSize; // User requested work size (bytes) for channel partitions + ssize_t channelSize; // User requested work size (bytes) for channel partitions // pools backed by comm->memPermanent struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; struct ncclMemoryPool memPool_ncclPointerList; + struct ncclMemoryPool memPool_ncclNvlsHandleList; // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. struct ncclComm* groupNext; @@ -346,6 +379,16 @@ struct ncclComm { bool finalizeCalled; // shared structures for finalization int finalizeRankCnt; + // group job to support multi-thread FT + struct ncclGroupJob *groupJob; + + /* store to buffer register request */ + struct ncclIntruQueue regRequestQueue; + /* store registered buffer */ + struct ncclIntruQueue regRecordQueue; + + // Tuning plugin + ncclTuner_t* tuner; }; enum ncclLaunchMode { diff --git a/src/include/core.h b/src/include/core.h index ac6ea77f2d..a1754beeb1 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -30,29 +30,6 @@ ret func(args) #endif // end PROFAPI -static __inline__ int ncclTypeSize(ncclDataType_t type) { - switch (type) { - case ncclInt8: - case ncclUint8: - return 1; - case ncclFloat16: -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: -#endif - return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: - return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: - return 8; - default: - return -1; - } -} - #include "debug.h" #include "checks.h" #include "cudawrap.h" diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index da9ce45a4f..cc363c1ac7 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -30,7 +30,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void if( err != CUDA_SUCCESS ) { \ const 
char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ - WARN("Cuda failure '%s'", errStr); \ + WARN("Cuda failure %d '%s'", err, errStr); \ return ncclUnhandledCudaError; \ } \ } while(false) @@ -40,7 +40,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ - WARN("Cuda failure '%s'", errStr); \ + WARN("Cuda failure %d '%s'", err, errStr); \ res = ncclUnhandledCudaError; \ goto label; \ } \ @@ -52,7 +52,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ - INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \ + INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \ } \ } while(false) @@ -79,6 +79,7 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); +DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000); // cuMem API support DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); diff --git a/src/include/debug.h b/src/include/debug.h index cd6e53b92b..eb5189058f 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -4,10 +4,11 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_DEBUG_H_ -#define NCCL_DEBUG_H_ +#ifndef NCCL_INT_DEBUG_H_ +#define NCCL_INT_DEBUG_H_ -#include "nccl_net.h" +#include "nccl.h" +#include "nccl_common.h" #include #include #include diff --git a/src/include/devcomm.h b/src/include/device.h similarity index 78% rename from src/include/devcomm.h rename to src/include/device.h index d4762b8b04..56f8039f30 100644 --- a/src/include/devcomm.h +++ b/src/include/device.h @@ 
-8,31 +8,33 @@ #define NCCL_DEVICE_H_ #include "nccl.h" +#include "nccl_common.h" #include "align.h" #include -#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; -#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* -#define NCCL_ALGO_TREE 0 -#define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET_DIRECT 2 -#define NCCL_ALGO_COLLNET_CHAIN 3 -#define NCCL_ALGO_NVLS 4 -#define NCCL_ALGO_NVLS_TREE 5 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; -#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 -#define NCCL_PROTO_LL 0 -#define NCCL_PROTO_LL128 1 -#define NCCL_PROTO_SIMPLE 2 extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 +#include "net_device.h" + +enum ncclDevRedOp_t { + ncclDevSum, ncclDevProd, ncclDevMinMax, + ncclDevPreMulSum, ncclDevSumPostDiv, + ncclNumDevRedOps +}; +struct ncclDevRedOpFull { + ncclDevRedOp_t op; + ncclRedOp_t proxyOp; + bool scalarArgIsPtr; + uint64_t scalarArg; +}; + union ncclLLFifoLine { /* Flags have to be *after* data, because otherwise, an incomplete receive from the network may receive the flag but not the data. 
@@ -85,6 +87,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK struct ncclConnInfo { // Regular comm mechanism char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send + void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv @@ -98,6 +101,7 @@ struct ncclConnInfo { uint64_t step; // Keep where we are uint64_t llLastCleaning; + ncclNetDeviceHandle_t netDeviceHandle; }; struct ncclProxyConnector { @@ -105,6 +109,7 @@ struct ncclProxyConnector { int tpLocalRank; int sameProcess; struct ncclProxyConnection* connection; + ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary }; struct ncclConnector { @@ -292,6 +297,7 @@ struct ncclDevComm { int rank; int nRanks; int buffSizes[NCCL_NUM_PROTOCOLS]; + int p2pChunkSize; // Operation list for aggregation int workFifoDepth; @@ -370,4 +376,88 @@ __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); } +// Host-side table of kernel function pointers. +extern int const ncclDevKernelCount; +extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; + +// Table of most specialized kernel function to run given func index. +extern int const ncclDevFuncRowToId[]; +extern void* const ncclDevKernelForFunc[/*funcIndex*/]; +extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; + +// Launch a one-rank reduction on stream. 
+ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream); + +// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py" +inline bool ncclNvlsSupported(int devRedOp, int type) { + switch (type) { + case ncclInt32: + case ncclUint32: + case ncclInt64: + case ncclUint64: + case ncclFloat16: + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + #endif + return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; + case ncclFloat: + case ncclDouble: + return devRedOp == ncclDevSum; + default: + return false; + } +} + +// `ncclDevFuncId()` needs to be in sync with "all_functions()" in "src/device/generate.py" +inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { + #if defined(__CUDA_BF16_TYPES_EXIST__) + constexpr int NumTypes = ncclNumTypes; + #else + constexpr int NumTypes = ncclNumTypes + 1; + #endif + + int row = 0; // ncclDevFuncIndex_P2p + if (coll == ncclFuncSendRecv) goto have_row; + row += 1; + + if (coll == ncclFuncAllGather) { + int algo1 = algo == NCCL_ALGO_RING ? 0 : + /*algo == NCCL_ALGO_NVLS*/ 1; + row += algo1*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncBroadcast) { + row += proto; + goto have_row; + } + row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncAllReduce) { + row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncReduce) { + row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncReduceScatter) { + int algo1 = algo == NCCL_ALGO_RING ?
0 : + /*algo == NCCL_ALGO_NVLS*/ 1; + row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; + +have_row: + return ncclDevFuncRowToId[row]; +} + +inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; } + #endif diff --git a/src/include/graph.h b/src/include/graph.h index ae524397da..fdd634894d 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -8,7 +8,7 @@ #define NCCL_GRAPH_H_ #include "nccl.h" -#include "devcomm.h" +#include "device.h" #include #include #include @@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); -ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); @@ -112,6 +111,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); #include "info.h" -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time); +ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL); #endif diff --git a/src/include/group.h b/src/include/group.h index 9b5ea9c475..72251147f5 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -14,7 +14,8 @@ ncclResult_t ncclGroupErrCheck(ncclResult_t ret); void ncclGroupCommJoin(struct ncclComm* comm); void ncclGroupCommPreconnect(struct ncclComm* comm); ncclResult_t ncclGroupCommLeave(struct 
ncclComm* comm); -void ncclGroupJobAbort(); +ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); +ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob); typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); @@ -52,8 +53,9 @@ struct ncclGroupJob { struct ncclComm **groupCommPreconnectHeadPtr; ncclResult_t *groupErrorPtr; volatile bool *abortFlagPtr; + int *groupBlockingPtr; struct ncclIntruQueue *asyncJobsPtr; - bool doneFlag; + bool initialized; }; ncclResult_t ncclGroupStartInternal(); @@ -87,14 +89,6 @@ static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { } inline ncclResult_t ncclGroupStartInternal() { - /* if previous group launch does not complete, don't launch this one. */ - if (ncclGroupJobMainPtr != NULL) { - if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) { - return ncclInvalidUsage; - } else { - NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr)); - } - } ncclGroupDepth++; return ncclSuccess; } diff --git a/src/include/ibvcore.h b/src/include/ibvcore.h index 00a6b6f60b..8d8ecf1ec8 100644 --- a/src/include/ibvcore.h +++ b/src/include/ibvcore.h @@ -1040,4 +1040,19 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc return qp->context->ops.post_send(qp, wr, bad_wr); } +struct ibv_ece { + /* + * Unique identifier of the provider vendor on the network. + * The providers will set IEEE OUI here to distinguish + * itself in non-homogenius network. + */ + uint32_t vendor_id; + /* + * Provider specific attributes which are supported or + * needed to be enabled by ECE users. 
+ */ + uint32_t options; + uint32_t comp_mask; +}; + #endif // NCCL_IBV_CORE_H_ diff --git a/src/include/ibvsymbols.h b/src/include/ibvsymbols.h index 7cf1e08d8c..906b0df747 100644 --- a/src/include/ibvsymbols.h +++ b/src/include/ibvsymbols.h @@ -36,6 +36,8 @@ struct ncclIbvSymbols { int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); + int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); + int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); }; /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index d1c7d08e71..c3709584c3 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -66,6 +66,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); +ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); +ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/include/info.h b/src/include/info.h index 5802f3e58d..f65ed2e698 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -8,7 +8,7 @@ #define NCCL_INFO_H_ #include "nccl.h" -#include "devcomm.h" +#include "device.h" #include "collectives.h" #include "core.h" #include "utils.h" @@ -54,6 +54,8 @@ 
struct ncclInfo { int nChannels; int nThreads; size_t nBytes; + size_t sendbuffSize; + size_t recvbuffSize; int nstepsPerLoop; int nchunksPerLoop; int chunkSize; @@ -67,6 +69,17 @@ inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { info->datatype = ncclInt8; } if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank + + /* compute buffer size for NVLS buffer registration */ + if (info->coll == ncclFuncAllGather) { + info->sendbuffSize = info->count * ncclTypeSize(info->datatype); + info->recvbuffSize = info->sendbuffSize * nRanks; + } else if (info->coll == ncclFuncReduceScatter) { + info->recvbuffSize = info->count * ncclTypeSize(info->datatype); + info->sendbuffSize = info->recvbuffSize * nRanks; + } else { + info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype); + } return ncclSuccess; } diff --git a/src/include/ipcsocket.h b/src/include/ipcsocket.h index 700f0bcdeb..ccecde84c7 100644 --- a/src/include/ipcsocket.h +++ b/src/include/ipcsocket.h @@ -30,6 +30,7 @@ struct ncclIpcSocket { ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); +ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h new file mode 100644 index 0000000000..a37ac203ea --- /dev/null +++ b/src/include/nccl_common.h @@ -0,0 +1,33 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEBUG_H_ +#define NCCL_DEBUG_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#endif diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index a387e66d7a..9b3e6719fc 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -8,6 +8,8 @@ #define NCCL_NET_H_ #include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" #include #define NCCL_NET_HANDLE_MAXSIZE 128 @@ -17,13 +19,89 @@ #define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 8 +#define NCCL_NET_MAX_REQUESTS 32 -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, 
NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +typedef ncclNetProperties_v7_t ncclNetProperties_t; +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. 
+ // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef ncclNet_v7_t ncclNet_t; + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7 + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7 + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. @@ -35,9 +113,7 @@ typedef struct { float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. -}ncclNetProperties_v6_t; - -typedef ncclNetProperties_v6_t ncclNetProperties_t; +} ncclNetProperties_v6_t; typedef struct { // Name of the network (mainly for logs) @@ -86,10 +162,49 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v6_t; -typedef ncclNet_v6_t ncclNet_t; +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. 
The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 +typedef ncclCollNet_v7_t ncclCollNet_t; +// v6 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; @@ -130,10 +245,6 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v6_t; -typedef ncclCollNet_v6_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 - // v5 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) @@ -219,95 +330,4 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v5_t; -// v4 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA - int speed; // Port speed in Mbps. - int port; // Port number. - int maxComms; // Maximum number of comms we can create -} ncclNetProperties_v4_t; - -// v4 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. 
- ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connectHandle - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v4_t; - -// v4 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. 
- ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
- ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v4_t; - #endif // end include guard diff --git a/src/include/nccl_tuner.h b/src/include/nccl_tuner.h new file mode 100644 index 0000000000..b4a696e385 --- /dev/null +++ b/src/include/nccl_tuner.h @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // nRanks: number of ranks in current communicator. Each communicator initializes its own tuner. + // nNodes: number of nodes in current communicator. + // logFunction: a logFunction can be useful to integrate logging together with NCCL core. + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - collType: collective type, e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - collNetSupport: whether collnet supports this type + // - nvlsSupport: whether nvlink sharp supports this type + // - numPipeOps: number of operations in the group + // + // Outputs: + // - algorithm: selected algorithm to be used for the given collective + // - protocol: selected protocol to be used for the given collective + // - nChannels: number of channels (hence SMs) to be used. 
+ // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes, + int collNetSupport, int nvlsSupport, int numPipeOps, + int *algorithm, int *protocol, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + ncclResult_t (*destroy)(); +} ncclTuner_v1_t; + +typedef ncclTuner_v1_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" + +#endif diff --git a/src/include/net_device.h b/src/include/net_device.h new file mode 100644 index 0000000000..8f7c0d6e1e --- /dev/null +++ b/src/include/net_device.h @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_DEVICE_H_ +#define NCCL_NET_DEVICE_H_ + +#define NCCL_NET_DEVICE_INVALID_VERSION 0x0 +#define NCCL_NET_MTU_SIZE 4096 + +// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin +// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 + +typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; + +typedef struct { + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + void* handle; + size_t size; + int needsProxyProgress; +} ncclNetDeviceHandle_v7_t; + +typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; + +#endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index fa1f5cf835..2ab8e3a2b0 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -160,7 +160,12 @@ typedef union nvmlValue_st #define NVML_FI_DEV_NVLINK_GET_SPEED 164 #define NVML_FI_DEV_NVLINK_GET_STATE 165 #define NVML_FI_DEV_NVLINK_GET_VERSION 166 -#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above + +#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device +#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE +#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links + +#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above /** * Information for a Field Value Sample diff --git a/src/include/p2p.h b/src/include/p2p.h index 426a15017a..6ffba4b0e1 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -12,7 +12,7 @@ #define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR typedef struct { - int data; // Currently only support an fd based descriptor + uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support } ncclCuDesc; typedef union { diff --git a/src/include/param.h b/src/include/param.h index c95b67c36b..963da9d175 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -12,6 +12,7 @@ const char* userHomeDir(); void setEnvFile(const char* fileName); void initEnv(); +const char *ncclGetEnv(const char *name); void ncclLoadParam(char const* env, int64_t deftVal, int64_t 
uninitialized, int64_t* cache); diff --git a/src/include/proxy.h b/src/include/proxy.h index ed6c59eebc..daf3885829 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -7,10 +7,11 @@ #ifndef NCCL_PROXY_H_ #define NCCL_PROXY_H_ -#include "devcomm.h" +#include "device.h" #include "info.h" #include "socket.h" #include "ipcsocket.h" +#include "nccl_net.h" #include #include "shm.h" #include "p2p.h" @@ -65,6 +66,8 @@ struct ncclProxySubArgs { uint64_t end; void* requests[NCCL_STEPS]; void* profilingEvents[NCCL_STEPS]; + void* recvRequestsCache[NCCL_STEPS]; + int recvRequestsSubCount; }; struct ncclProxyArgs { @@ -146,7 +149,7 @@ struct ncclProxyProgressState { char opsPoolShmSuffix[6]; pthread_t thread; - bool stop; + volatile int stop; struct ncclProxyPeer** localPeers; struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; struct ncclProxyArgs* active; @@ -157,11 +160,12 @@ struct ncclProxyProgressState { // Expected proxy response fifo struct ncclExpectedProxyResponse { - void* opId; - int respSize; - bool done; - void* respBuff; - struct ncclExpectedProxyResponse* next; + void* opId; + int respSize; + bool done; + void* respBuff; + ncclResult_t res; + struct ncclExpectedProxyResponse* next; }; struct ncclProxyAsyncOp { @@ -181,7 +185,16 @@ struct ncclProxyLocalPeer { int asyncOpCounter; }; +// Common response header for all proxyOps +// We pack this into a struct to reduce the number of blocking send and recv calls +struct ncclProxyRpcResponseHeader { + void* opId; + ncclResult_t res; + int respSize; +}; + struct ncclProxyState { + int internalRefCount; int refCount; int tpRank; int tpnRanks; @@ -196,11 +209,13 @@ struct ncclProxyState { ncclNet_t* ncclNet; ncclCollNet_t* ncclCollNet; volatile uint32_t* abortFlag; + volatile uint32_t* abortFlagRefCount; // Service thread pthread_t thread; struct ncclSocket* listenSock; - int stop; + volatile int stop; CUcontext cudaCtx; + ncclResult_t asyncResult; // Used by main thread union ncclSocketAddress* 
peerAddresses; @@ -233,8 +248,11 @@ struct ncclProxyConnection { struct ncclProxyArgs *proxyAppend; struct ncclProxyArgs **proxyAppendPtr; void* transportResources; + ncclNetDeviceHandle_t* netDeviceHandle; + void* mhandles[NCCL_NUM_PROTOCOLS]; proxyConnectState state; struct ncclCollNetSharedRes* collNet; + int needsProxyProgress; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); @@ -260,7 +278,7 @@ enum ncclProxyMsgType { ncclProxyMsgClose = 6, ncclProxyMsgAbort = 7, ncclProxyMsgStop = 8, - ncclProxyMsgConvertFd = 9, // cuMem API support (UDS) + ncclProxyMsgGetFd = 9, // cuMem API support (UDS) }; // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types @@ -272,9 +290,10 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); -ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd); +ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd); ncclResult_t ncclProxyStop(struct ncclComm* comm); ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); -ncclResult_t ncclProxyDestroy(struct ncclComm* comm); +ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState); +ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState); #endif diff --git a/src/include/shm.h b/src/include/shm.h index 61b0b4d8f5..e75caa6a6e 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -14,4 +14,12 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de ncclResult_t ncclShmClose(ncclShmHandle_t handle); ncclResult_t 
ncclShmUnlink(ncclShmHandle_t handle); +struct ncclShmemCollBuff { + volatile size_t *cnt[2]; + volatile void *ptr[2]; + int round; +}; + +ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); + #endif diff --git a/src/include/transport.h b/src/include/transport.h index 3884a1152a..d0cd9747e2 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -7,7 +7,7 @@ #ifndef NCCL_TRANSPORT_H_ #define NCCL_TRANSPORT_H_ -#include "devcomm.h" +#include "device.h" #include "graph.h" #include "nvmlwrap.h" #include "core.h" @@ -65,6 +65,7 @@ struct ncclNvlsSharedRes { CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer char* ucBuff; // Unicast NVLS buffer address char shareableHandle[NVLS_HANDLE_SIZE]; + size_t ucGran; int nChannels; }; @@ -102,8 +103,20 @@ struct ncclTransport { ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); +// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange +#define USE_POSIX_FD 1 + +#if USE_POSIX_FD +#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +#else +#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE +#endif + ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void 
**outRegBufRecv); +ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); ncclResult_t ncclNvlsFree(struct ncclComm* comm); enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/include/tuner.h b/src/include/tuner.h new file mode 100644 index 0000000000..d8b275017e --- /dev/null +++ b/src/include/tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_TUNER_H_ +#define NCCL_INT_TUNER_H_ + +#include "nccl_tuner.h" + +// Tuning plugin to override NCCL's default algorithm/protocol tuning. + +// Attempts to load NCCL tuner from environmental variable. +// Returns ncclSuccess if the correct tuner symbol has been found and +// successfully loaded. Otherwise returns an error and also logs the error. +ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner); + +// Cleans up NCCL tuner plugin. 
+ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner); +#endif diff --git a/src/include/utils.h b/src/include/utils.h index 1c300b0cda..60f6efb5f8 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -13,6 +13,7 @@ #include #include #include +#include #include int ncclCudaCompCap(); @@ -259,11 +260,6 @@ struct ncclMemoryPool { struct Cell { Cell *next; }; - template - union CellSized { - Cell cell; - alignas(Align) char space[Size]; - }; struct Cell* head; struct Cell* tail; // meaningful only when head != nullptr }; @@ -275,14 +271,15 @@ inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { template inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) { using Cell = ncclMemoryPool::Cell; - using CellSized = ncclMemoryPool::CellSized; Cell* cell; if (__builtin_expect(me->head != nullptr, true)) { cell = me->head; me->head = cell->next; } else { // Use the internal allocate() since it doesn't memset to 0 yet. - cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized)); + size_t cellSize = std::max(sizeof(Cell), sizeof(T)); + size_t cellAlign = std::max(alignof(Cell), alignof(T)); + cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign); } memset(cell, 0, sizeof(T)); return reinterpret_cast(cell); @@ -349,6 +346,32 @@ inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { return ans; } +template +inline bool ncclIntruQueueDelete(ncclIntruQueue *me, T *x) { + T *prev = nullptr; + T *cur = me->head; + bool found = false; + + while (cur) { + if (cur == x) { + found = true; + break; + } + prev = cur; + cur = cur->*next; + } + + if (found) { + if (prev == nullptr) + me->head = cur->*next; + else + prev->*next = cur->*next; + if (cur == me->tail) + me->tail = prev; + } + return found; +} + template inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { T *ans = me->head; diff --git a/src/init.cc b/src/init.cc index 309ce10bb9..c681f2afa8 100644 --- a/src/init.cc +++ 
b/src/init.cc @@ -16,6 +16,7 @@ #include "enqueue.h" #include "graph.h" #include "argcheck.h" +#include "tuner.h" #include #include #include @@ -24,6 +25,7 @@ #include #include #include +#include "param.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -177,7 +179,13 @@ static ncclResult_t commFree(ncclComm_t comm) { * free all intra-process communicators; therefore, we only need to focus on local * resource cleanup in commFree(). */ if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) { - pthread_join(comm->proxyState->thread, nullptr); + if (*comm->abortFlag == 0) { + /* regular thread join */ + pthread_join(comm->proxyState->thread, nullptr); + } else { + /* try to detach thread due to abort */ + ncclProxyTryDetach(comm->proxyState); + } } delete[] comm->userRedOps; @@ -211,7 +219,7 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); - NCCLCHECK(ncclProxyDestroy(comm)); + NCCLCHECK(ncclProxyDestroy(comm->sharedRes->proxyState)); free(comm->sharedRes); } } @@ -229,13 +237,25 @@ static ncclResult_t commFree(ncclComm_t comm) { if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) { NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag)); - free(comm->abortFlagRefCount); + free((void*)comm->abortFlagRefCount); } free((void*)comm->config.netName); free(comm->topParentRanks); free(comm->topParentLocalRanks); + while (!ncclIntruQueueEmpty(&comm->regRecordQueue)) { + struct ncclRegRecord* rec = ncclIntruQueueDequeue(&comm->regRecordQueue); + NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize)); + free(rec->addrs); + free(rec); + } + + while (!ncclIntruQueueEmpty(&comm->regRequestQueue)) { + struct ncclRegRequest* req = ncclIntruQueueDequeue(&comm->regRequestQueue); + free(req); + } + commPoison(comm); // poison comm before free to 
avoid comm reuse. free(comm); @@ -275,7 +295,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; if (*comm->abortFlag) { - ncclGroupJobAbort(); + ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); if (ret != ncclSuccess) { @@ -284,6 +304,11 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { if (ret == ncclInProgress) ret = ncclInvalidArgument; goto exit; } + /* if there is linked group job, we should complete it. */ + if (comm->groupJob) { + NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); + comm->groupJob = NULL; + } } exit: @@ -338,6 +363,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList); + ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList); comm->groupNext = reinterpret_cast(0x1); comm->preconnectNext = reinterpret_cast(0x1); @@ -373,6 +399,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->topParentRanks[i] = i; } + ncclIntruQueueConstruct(&comm->regRequestQueue); + ncclIntruQueueConstruct(&comm->regRecordQueue); ncclIntruQueueMpscConstruct(&comm->callbackQueue); return ncclSuccess; } @@ -393,6 +421,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; } + tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize; tmpCommAndChans.comm.channels = &devCommAndChans->channels[0]; comm->workFifoDepth = ncclParamWorkFifoDepth(); @@ -500,7 +529,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine)) #define DEFAULT_LL128_BUFFSIZE 
(NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t)) #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */ -#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */ NCCL_PARAM(BuffSize, "BUFFSIZE", -2); NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2); NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2); @@ -516,8 +544,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() }; int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE }; - if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; } @@ -525,6 +551,10 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize(); else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); else comm->p2pChunkSize = ncclParamP2pPciChunkSize(); + + // Make sure P2P chunksize is not larger than coll chunksize. + if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + if (comm->sharedRes->owner != comm) { /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. 
*/ comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize); @@ -606,7 +636,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n if (share) { if (myinfo->isMaster) { comm->collNetSharedRes = parent->collNetSharedRes; - comm->collNetChannels = std::min(std::max(comm->nChannels, comm->nvlsChannels), parent->collNetSharedRes->nChannels); + comm->collNetChannels = std::min(comm->nChannels, parent->collNetSharedRes->nChannels); for (int c = 0; c < comm->collNetChannels; ++c) NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); } @@ -625,8 +655,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n } else { /* this allocated buffer will be freed on proxy side */ NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); - /* TODO: min or max? */ - comm->collNetChannels = comm->collNetSharedRes->nChannels = std::max(comm->nChannels, comm->nvlsChannels); + comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels; comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; for (int c = 0; c < comm->collNetChannels; c++) { struct ncclChannel* channel = comm->channels + c; @@ -804,6 +833,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap); for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap); + + comm->nvlsRegSupport = 1; for (int i = 0; i < nranks; i++) { if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { @@ -816,6 +847,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->intraNext = comm->peerInfo[i].comm; } } + + if (comm->nvlsRegSupport) { + for (int j = i + 1; 
j < nranks; j++) { + if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash && + comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) { + comm->nvlsRegSupport = 0; + break; + } + } + } } TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); @@ -859,7 +900,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Determine local CollNet support if (collNetSupport(comm)) { - char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); + const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE"); if (collNetEnable != NULL) { INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); if (strcmp(collNetEnable, "1") == 0) { @@ -872,22 +913,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECK(ncclNvlsInit(comm)); // Get rings and trees + memset(&ringGraph, 0, sizeof(struct ncclTopoGraph)); ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; - ringGraph.collNet = 0; ringGraph.minChannels = 1; ringGraph.maxChannels = MAXCHANNELS/2; NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail); + memset(&treeGraph, 0, sizeof(struct ncclTopoGraph)); treeGraph.id = 1; treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; - treeGraph.collNet = 0; treeGraph.minChannels = ringGraph.nChannels; treeGraph.maxChannels = ringGraph.nChannels; NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail); + memset(&collNetGraph, 0, sizeof(struct ncclTopoGraph)); collNetGraph.id = 2; collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; @@ -895,20 +937,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p if (comm->collNetSupport) { 
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); - } else { - collNetGraph.nChannels = 0; } + memset(&nvlsGraph, 0, sizeof(struct ncclTopoGraph)); nvlsGraph.id = 3; nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS; - nvlsGraph.collNet = 0; nvlsGraph.minChannels = 1; nvlsGraph.maxChannels = MAXCHANNELS; if (comm->nvlsSupport) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail); - } else { - nvlsGraph.nChannels = 0; } // Initialize num P2P LL buffers for this communicator @@ -1136,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p int sendNode = (node+delta)%nNodes; for (int step=0; step < steps; step++) { int recvIndex = (localRank-step+steps)%steps; - int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1; + int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1; tasks->p2pRecvOrder[i] = recvRank; int sendIndex = (localRank+step)%steps; int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? 
nodeRanks[sendNode].localRankToRank[sendIndex] : -1; @@ -1197,7 +1235,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } if (comm->intraRank == 0) { // Load ncclParamLaunchMode - char* str = getenv("NCCL_LAUNCH_MODE"); + const char* str = ncclGetEnv("NCCL_LAUNCH_MODE"); enum ncclLaunchMode mode, modeOld; if (str && strcasecmp(str, "GROUP") == 0) { mode = ncclLaunchModeGroup; @@ -1357,6 +1395,11 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail); + NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail); + if (comm->tuner) { + NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog)); + } + // update communicator state comm->initState = ncclSuccess; @@ -1425,7 +1468,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.maxCTAs = maxCTAsEnv; } - envNetName = getenv("NCCL_NET"); + envNetName = ncclGetEnv("NCCL_NET"); if (envNetName) tmpNetName = envNetName; if (tmpNetName != NULL) { @@ -1560,7 +1603,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni ncclResult_t res = ncclSuccess; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob *job = NULL; - char* env = getenv("NCCL_COMM_ID"); + const char* env = ncclGetEnv("NCCL_COMM_ID"); if (env && myrank == 0) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail); @@ -1602,7 +1645,7 @@ exit: fail: if (comm) { if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag); - if (comm->abortFlagRefCount) free(comm->abortFlagRefCount); + if (comm->abortFlagRefCount) free((void*)comm->abortFlagRefCount); free(comm); } if (newcomm) *newcomm = NULL; @@ -1777,6 +1820,11 @@ static ncclResult_t commCleanup(ncclComm_t comm) { CUDACHECK(cudaSetDevice(commDevice)); } + if (comm->tuner != NULL) { + NCCLCHECK(comm->tuner->destroy()); + 
NCCLCHECK(ncclCloseTunerPlugin(&comm->tuner)); + } + NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) { @@ -1991,6 +2039,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc NCCLCHECK(ncclGroupStartInternal()); NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail); + NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail); /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */ *newcomm = NCCL_COMM_NULL; @@ -2037,7 +2086,7 @@ fail: if (childComm) { if (comm && !comm->config.splitShare) { if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag); - if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount); + if (childComm->abortFlagRefCount) free((void*)childComm->abortFlagRefCount); } free(childComm); } @@ -2074,6 +2123,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); + if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE); return ncclSuccess; } @@ -2116,3 +2166,208 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { *rank = comm->rank; return ncclSuccess; } + +NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); + +NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + +#if CUDART_VERSION >= 12010 + size_t granularity; + if (ncclParamLocalRegister()) { + if (comm == NCCL_COMM_NULL || buff == NULL || handle == NULL || size == 0) { + WARN("Invalid arguments comm %p, buff %p, size %ld, handle %p", comm, buff, size, handle); + ret = 
ncclInvalidArgument; + } else if (comm->nvlsSupport) { + CUmulticastObjectProp prop = comm->nvlsResources->properties; + + prop.size = size; + CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + if ((uintptr_t)buff % comm->nvlsResources->ucGran == 0 && size % granularity == 0) { + /* we can direct register what user provide */ + struct ncclRegRequest* req; + NCCLCHECK(ncclCalloc(&req, 1)); + req->buff = (uintptr_t)buff; + req->size = size; + ncclIntruQueueEnqueue(&comm->regRequestQueue, req); + *handle = (void*)req; + } else { + void* base; + size_t baseSize; + /* Since we don't provide actually allocated buffer size for users by ncclMemAlloc, + * therefore, we need to get the full range of the buffer by cuMemGetAddressRange to + * register buffers. */ + CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&base, &baseSize, (CUdeviceptr)buff)); + if ((uintptr_t)base % comm->nvlsResources->ucGran == 0 && baseSize % granularity == 0) { + struct ncclRegRequest* req; + NCCLCHECK(ncclCalloc(&req, 1)); + req->buff = (uintptr_t)base; + req->size = baseSize; + ncclIntruQueueEnqueue(&comm->regRequestQueue, req); + *handle = (void*)req; + } else { + WARN("register fails, buffer %p (aligned %s, granularity %ld) and size %ld (aligned %s, granularity %ld) for registration", buff, (uintptr_t)buff % comm->nvlsResources->ucGran == 0 ? "TRUE" : "FALSE", comm->nvlsResources->ucGran, size, size % granularity == 0 ? 
"TRUE" : "FALSE", granularity); + ret = ncclInvalidArgument; + } + } + } + } +#endif + + return ret; +} + +NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { + ncclResult_t ret = ncclSuccess; + +#if CUDART_VERSION >= 12010 + struct ncclRegRequest* dreq = (struct ncclRegRequest*)handle; + if (ncclParamLocalRegister()) { + if (comm == NCCL_COMM_NULL || handle == NULL) { + WARN("Invalid arguments comm %p, handle %p", comm, handle); + ret = ncclInvalidArgument; + } else { + struct ncclRegRecord* rec; + + /* first release register record */ + rec = ncclIntruQueueHead(&comm->regRecordQueue); + + while (rec) { + if (rec->buff == dreq->buff && rec->size == dreq->size) { + NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize)); + ncclIntruQueueDelete(&comm->regRecordQueue, rec); + free(rec->addrs); + free(rec); + break; + } + rec = rec->next; + } + + /* then free register request */ + if (ncclIntruQueueDelete(&comm->regRequestQueue, dreq) == false) { + WARN("Invalid handle %p", handle); + ret = ncclInvalidArgument; + } + } + } +#endif + + return ret; +} + +NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); +ncclResult_t ncclMemAlloc(void **ptr, size_t size) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + +#if CUDART_VERSION >= 12010 + size_t memGran = 0; + size_t mcGran = 0; + CUdevice currentDev; + CUmemAllocationProp memprop = {}; + CUmulticastObjectProp mcprop = {}; + CUmemAccessDesc accessDesc = {}; + CUmemGenericAllocationHandle handle; + int cudaDev; + int flag = 0; + int dcnt; + int mcSupport = 0; + + if (ptr == NULL || size == 0) goto fallback; + + if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; + + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(&currentDev, cudaDev)); + if (CUPFN(cuMulticastCreate) != NULL) + CUCHECK(cuDeviceGetAttribute(&mcSupport, 
CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); + + if (mcSupport) { + memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memprop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE; + memprop.location.id = currentDev; + // Query device to see if RDMA support is available + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); + if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; + CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + + /* mc property */ + CUDACHECK(cudaGetDeviceCount(&dcnt)); + mcprop.size = size; + /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ + mcprop.numDevices = dcnt; + mcprop.handleTypes = NVLS_CU_MEM_HANDLE_TYPE; + mcprop.flags = 0; + CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + /* only size needs to be aligned to mcGran */ + ALIGN_SIZE(size, mcGran); + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + /* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + /* Now allow RW access to the newly mapped memory */ + for (int i = 0; i < dcnt; ++i) { + int p2p = 0; + if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) { + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = i; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + } + } + goto exit; + } + +fallback: +#endif + CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail); + +exit: + return ret; +fail: + goto exit; +} + +NCCL_API(ncclResult_t, ncclMemFree, void *ptr); +ncclResult_t 
ncclMemFree(void *ptr) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + int saveDevice; + + CUDACHECK(cudaGetDevice(&saveDevice)); +#if CUDART_VERSION >= 12010 + CUdevice ptrDev = 0; + int mcSupport = 0; + + if (ptr == NULL) goto fallback; + + if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; + + CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); + if (CUPFN(cuMulticastCreate) != NULL) + CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail); + + CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); + if (mcSupport) { + NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); + goto exit; + } + +fallback: +#endif + CUDACHECKGOTO(cudaFree(ptr), ret, fail); + +exit: + cudaSetDevice(saveDevice); + return ret; +fail: + goto exit; +} diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 334ee10f69..f2260a1c0e 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -12,7 +12,7 @@ #include // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage -NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0); +NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2); static int ncclCuMemSupported = 0; @@ -43,7 +43,9 @@ error: } int ncclCuMemEnable() { - return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable()); + // NCCL_CUMEM_ENABLE=-2 means auto-detect CUMEM support + int param = ncclParamCuMemEnable(); + return param >= 0 ? 
param : (param == -2 && ncclCuMemSupported); } #define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr @@ -74,6 +76,8 @@ DECLARE_CUDA_PFN(cuMemRelease, 10020); DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000); DECLARE_CUDA_PFN(cuMemSetAccess, 10020); DECLARE_CUDA_PFN(cuMemUnmap, 10020); +/* ncclMemAlloc/Free */ +DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000); #if CUDA_VERSION >= 11070 /* transport/collNet.cc/net.cc*/ DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support @@ -137,6 +141,8 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1); LOAD_SYM(cuMemSetAccess, 10020, 1); LOAD_SYM(cuMemUnmap, 10020, 1); +/* ncclMemAlloc/Free */ + LOAD_SYM(cuPointerGetAttribute, 4000, 1); #if CUDA_VERSION >= 11070 LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support #endif @@ -158,7 +164,7 @@ static ncclResult_t initResult; static void initOnceFunc() { do { - char* val = getenv("CUDA_LAUNCH_BLOCKING"); + const char* val = ncclGetEnv("CUDA_LAUNCH_BLOCKING"); ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0); } while (0); @@ -167,7 +173,7 @@ static void initOnceFunc() { * Load CUDA driver library */ char path[1024]; - char *ncclCudaPath = getenv("NCCL_CUDA_PATH"); + const char *ncclCudaPath = ncclGetEnv("NCCL_CUDA_PATH"); if (ncclCudaPath == NULL) snprintf(path, 1024, "%s", "libcuda.so"); else diff --git a/src/misc/ibvsymbols.cc b/src/misc/ibvsymbols.cc index c41a457c8f..bd5f33390f 100644 --- a/src/misc/ibvsymbols.cc +++ b/src/misc/ibvsymbols.cc @@ -50,6 +50,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp); ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init); ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str); + + ASSIGN_SYM(ibvSymbols, ibv_query_ece, ibv_internal_query_ece); + ASSIGN_SYM(ibvSymbols, ibv_set_ece, 
ibv_internal_set_ece); ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr; ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port; @@ -123,6 +126,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init); LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str); + LOAD_SYM_VERSION(ibvhandle, "ibv_query_ece", ibvSymbols->ibv_internal_query_ece, "IBVERBS_1.10"); + LOAD_SYM_VERSION(ibvhandle, "ibv_set_ece", ibvSymbols->ibv_internal_set_ece, "IBVERBS_1.10"); + return ncclSuccess; teardown: @@ -150,6 +156,8 @@ teardown: ibvSymbols->ibv_internal_destroy_qp = NULL; ibvSymbols->ibv_internal_fork_init = NULL; ibvSymbols->ibv_internal_event_type_str = NULL; + ibvSymbols->ibv_internal_query_ece = NULL; + ibvSymbols->ibv_internal_set_ece = NULL; if (ibvhandle != NULL) dlclose(ibvhandle); return ncclSystemError; diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index bc896e10eb..eb4e52b606 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -45,11 +45,30 @@ ncclResult_t wrap_ibv_symbols(void) { } \ return ncclSuccess; +#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(container, internal_name, call, success_retval, name, supported) \ + if (container.internal_name == NULL) { \ + INFO(NCCL_NET, "Call to " name " skipped, internal_name doesn't exist"); \ + *supported = 0; \ + return ncclSuccess; \ + } \ + int ret = container.call; \ + if (ret == ENOTSUP || ret == EOPNOTSUPP) { \ + INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + *supported = 0; \ + return ncclSuccess; \ + } else if (ret != success_retval) { \ + WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + *supported = 1; \ + return ncclSystemError; \ + } \ + *supported = 1; \ + return ncclSuccess; + #define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ 
int ret = container.call; \ if (ret != success_retval) { \ - WARN("Call to " name " failed with error %s", strerror(ret)); \ + WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \ return ncclSystemError; \ } \ return ncclSuccess; @@ -187,6 +206,14 @@ ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp"); } +ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ + IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_query_ece, ibv_internal_query_ece(qp, ece), 0, "ibv_query_ece", supported); +} + +ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ + IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_set_ece, ibv_internal_set_ece(qp, ece), 0, "ibv_set_ece", supported); +} + ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) { *ret = (char *) ibvSymbols.ibv_internal_event_type_str(event); return ncclSuccess; diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index b2dee4852d..9d66ac7197 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -30,7 +30,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v handle->fd = -1; handle->socketName[0] = '\0'; if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) { - WARN("UDS: Socket creation error : %d", errno); + WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno); return ncclSystemError; } @@ -54,7 +54,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick #endif if (bind(fd, (struct sockaddr *)&cliaddr, 
sizeof(cliaddr)) < 0) { - WARN("UDS: Binding to socket %s failed : %d", temp, errno); + WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno); close(fd); return ncclSystemError; } @@ -73,6 +73,15 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v return ncclSuccess; } +ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) { + if (handle == NULL) { + WARN("ncclSocketGetFd: pass NULL socket"); + return ncclInvalidArgument; + } + if (fd) *fd = handle->fd; + return ncclSuccess; +} + ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { if (handle == NULL) { return ncclInternalError; @@ -90,7 +99,7 @@ ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { return ncclSuccess; } -ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { +ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) { struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; struct iovec iov[1]; @@ -107,8 +116,13 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); - iov[0].iov_base = (void *)dummy_buffer; - iov[0].iov_len = sizeof(dummy_buffer); + if (hdr == NULL) { + iov[0].iov_base = (void *)dummy_buffer; + iov[0].iov_len = sizeof(dummy_buffer); + } else { + iov[0].iov_base = hdr; + iov[0].iov_len = hdrLen; + } msg.msg_iov = iov; msg.msg_iovlen = 1; @@ -121,25 +135,30 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; } - if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { - if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { - WARN("UDS: Receiving data over socket failed"); + if (recvFd != NULL) { + if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { + if 
((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { + WARN("UDS: Receiving data over socket failed"); + return ncclSystemError; + } + + memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); + } else { + WARN("UDS: Receiving data over socket %s failed", handle->socketName); return ncclSystemError; } - - memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); - } else { - WARN("UDS: Receiving data over socket %s failed", handle->socketName); - return ncclSystemError; + TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); } - TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); - return ncclSuccess; } -ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { - struct msghdr msg; +ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { + return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd); +} + +ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) { + struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; struct iovec iov[1]; char temp[NCCL_IPC_SOCKNAME_LEN]; @@ -149,6 +168,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra } control_un; struct cmsghdr *cmptr; + char dummy_buffer[1]; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to @@ -162,35 +182,43 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra } (void) strncpy(cliaddr.sun_path, temp, len); - TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); - #ifdef USE_ABSTRACT_SOCKET cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick #endif - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); + TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp); - cmptr = CMSG_FIRSTHDR(&msg); - cmptr->cmsg_len = 
CMSG_LEN(sizeof(int)); - cmptr->cmsg_level = SOL_SOCKET; - cmptr->cmsg_type = SCM_RIGHTS; + if (sendFd != -1) { + TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); - memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); + + cmptr = CMSG_FIRSTHDR(&msg); + cmptr->cmsg_len = CMSG_LEN(sizeof(int)); + cmptr->cmsg_level = SOL_SOCKET; + cmptr->cmsg_type = SCM_RIGHTS; + memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + } msg.msg_name = (void *)&cliaddr; msg.msg_namelen = sizeof(struct sockaddr_un); - iov[0].iov_base = (void *)""; - iov[0].iov_len = 1; + if (hdr == NULL) { + iov[0].iov_base = (void *)dummy_buffer; + iov[0].iov_len = sizeof(dummy_buffer); + } else { + iov[0].iov_base = hdr; + iov[0].iov_len = hdrLen; + } msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_flags = 0; ssize_t sendResult; - while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) { + while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) { if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { - WARN("UDS: Sending data over socket %s failed : %d", temp, errno); + WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); return ncclSystemError; } if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; @@ -198,3 +226,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra return ncclSuccess; } + +ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { + return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); +} diff --git a/src/misc/param.cc b/src/misc/param.cc index bf7aa00871..e0b6ab821b 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -63,7 +63,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&mutex); if 
(__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { - char* str = getenv(env); + const char* str = ncclGetEnv(env); int64_t value = deftVal; if (str && strlen(str) > 0) { errno = 0; @@ -79,3 +79,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6 } pthread_mutex_unlock(&mutex); } + +const char *ncclGetEnv(const char *name) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, initEnv); + return getenv(name); +} \ No newline at end of file diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc index 145b18fe8c..785d616b8b 100644 --- a/src/misc/profiler.cc +++ b/src/misc/profiler.cc @@ -61,7 +61,7 @@ void ncclProfilingDump() { static int dumpDone = 0; if (dumpDone) return; dumpDone = 1; - const char* str = getenv("NCCL_PROXY_PROFILE"); + const char* str = ncclGetEnv("NCCL_PROXY_PROFILE"); if (!str) { free(profilingEvents); return; } FILE* f = fopen(str, "w"); fprintf(f, "[\n"); diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index ce05c3ef3e..80ece40c1c 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -5,6 +5,7 @@ ************************************************************************/ #include "shm.h" +#include "comm.h" #include "checks.h" #include #include @@ -67,7 +68,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail); } - if (ftruncate(fd, realShmSize) != 0) { + if (fallocate(fd, 0, 0, realShmSize) != 0) { WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize); ret = ncclSystemError; goto fail; @@ -162,3 +163,37 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { } return ret; } + +ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) { + ncclResult_t ret = ncclSuccess; + int curRound = shmem->round; + size_t mycnt; + + if (comm == NULL || shmem == NULL || 
sendbuff == NULL || recvbuff == NULL) { + ret = ncclInvalidArgument; + goto exit; + } + + memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize); + /* sync among local ranks */ + mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL); + if (mycnt == comm->localRanks) { + *shmem->cnt[curRound ^ 1] = 0; /* prepare next round */ + __atomic_store_n(shmem->cnt[curRound], comm->localRanks + 1, __ATOMIC_RELEASE); /* release everyone */ + } else { + uint64_t t0 = clockNano(); + while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) { + if (clockNano() - t0 >= 5 * 1000) sched_yield(); + if (*comm->abortFlag == 1) { + ret = ncclInternalError; + goto exit; + } + } + } + + memcpy(recvbuff, (const void*)shmem->ptr[curRound], comm->localRanks * typeSize); + shmem->round ^= 1; + +exit: + return ret; +} diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 5700d83cdc..149bd73aa1 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -11,6 +11,7 @@ #include #include #include +#include "param.h" static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { int bytes = 0; @@ -84,7 +85,7 @@ static uint16_t socketToPort(union ncclSocketAddress *addr) { /* Allow the user to force the IPv4/IPv6 interface selection */ static int envSocketFamily(void) { int family = -1; // Family selection is not forced, will use first one found - char* env = getenv("NCCL_SOCKET_FAMILY"); + const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY"); if (env == NULL) return family; @@ -325,7 +326,7 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa // Allow user to force the INET socket family selection int sock_family = envSocketFamily(); // User specified interface - char* env = getenv("NCCL_SOCKET_IFNAME"); + const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME"); if (env && strlen(env) > 1) { INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME 
set by environment to %s", env); // Specified by user : find or fail @@ -337,10 +338,10 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); // else see if we can get some hint from COMM ID if (nIfs == 0) { - char* commId = getenv("NCCL_COMM_ID"); + const char* commId = ncclGetEnv("NCCL_COMM_ID"); if (commId && strlen(commId) > 1) { - INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); - // Try to find interface that is in the same subnet as the IP in comm id + INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); + // Try to find interface that is in the same subnet as the IP in comm id union ncclSocketAddress idAddr; ncclSocketGetAddrFromString(&idAddr, commId); nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc new file mode 100644 index 0000000000..bfe61e8c1d --- /dev/null +++ b/src/misc/tuner.cc @@ -0,0 +1,82 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include + +#include "debug.h" +#include "nccl_tuner.h" + +pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; +static int tunerPluginRefCount = -1; +static void* tunerPluginLib = nullptr; +ncclTuner_t* tunerSymbol = nullptr; + +ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) { + // Initialize to nullptr by default if plugin tuner cannot be loaded. 
+ *tuner = nullptr; + if (tunerPluginRefCount == -2) return ncclSuccess; + + pthread_mutex_lock(&tunerPluginLock); + if (tunerPluginRefCount == -1) { + tunerPluginRefCount = -2; // Default: no plugin, don't try again later + + const char* name = getenv("NCCL_TUNER_PLUGIN"); + if (name) { + INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name); + tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL); + } + if (tunerPluginLib == nullptr) { + // dlopen does not guarantee to set errno, but dlerror only gives us a + // string, so checking errno doesn't hurt to try to provide a better + // error message + if (errno == ENOENT) { + INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name); + } else { + INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror()); + } + } else { + tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL); + if (tunerSymbol == nullptr) { + INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name); + dlclose(tunerPluginLib); + tunerPluginLib = nullptr; + } else { + INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name); + tunerPluginRefCount = 0; + } + } + } + + if (tunerPluginRefCount >= 0) { + *tuner = tunerSymbol; + INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name); + tunerPluginRefCount++; + } + pthread_mutex_unlock(&tunerPluginLock); + return ncclSuccess; +} + +ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) { + if (*tuner == nullptr) return ncclSuccess; + pthread_mutex_lock(&tunerPluginLock); + if (--tunerPluginRefCount == 0) { + if (tunerPluginLib == nullptr) { + WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n"); + } else { + INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name); + dlclose(tunerPluginLib); + } + tunerPluginLib = nullptr; + tunerSymbol = nullptr; + *tuner = nullptr; + tunerPluginRefCount = -1; + } + 
pthread_mutex_unlock(&tunerPluginLock); + return ncclSuccess; +} diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 20e8e41a60..b775666799 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -85,13 +85,13 @@ uint64_t getHash(const char* string, int n) { #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" uint64_t getHostHash(void) { char hostHash[1024]; - char *hostId; + const char *hostId; // Fall back is the full hostname if something fails (void) getHostName(hostHash, sizeof(hostHash), '\0'); int offset = strlen(hostHash); - if ((hostId = getenv("NCCL_HOSTID")) != NULL) { + if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) { INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId); strncpy(hostHash, hostId, sizeof(hostHash)); } else { diff --git a/src/nccl.h.in b/src/nccl.h.in index 0b613eef93..1585d58acb 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -78,6 +78,15 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } +/* NCCL malloc and free function for all types of NCCL optimizations + * (e.g. user buffer registration). The actual allocated size might + * be larger than requested due to granularity requirement. */ +ncclResult_t ncclMemAlloc(void** ptr, size_t size); +ncclResult_t pncclMemAlloc(void** ptr, size_t size); + +ncclResult_t ncclMemFree(void *ptr); +ncclResult_t pncclMemFree(void *ptr); + /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. 
* This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library @@ -417,6 +426,14 @@ ncclResult_t pncclGroupStart(); ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); +/* Register CUDA buffer for zero-copy operation */ +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); + +/* Deregister CUDA buffer */ +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle); +ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); + #ifdef __cplusplus } // end extern "C" #endif diff --git a/src/net.cc b/src/net.cc index 3077f8806f..2bfc9a9277 100644 --- a/src/net.cc +++ b/src/net.cc @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #include "net.h" #include "bootstrap.h" #include "checks.h" @@ -9,148 +15,190 @@ //#include //#include -static ncclNet_v6_t ncclNet_v4_as_v6; -static ncclNet_v6_t ncclNet_v5_as_v6; -static ncclNet_v4_t *ncclNet_v4; +static ncclNet_v7_t ncclNet_v5_as_v7; +static ncclNet_v7_t ncclNet_v6_as_v7; static ncclNet_v5_t *ncclNet_v5; -static ncclCollNet_v6_t ncclCollNet_v4_as_v6; -static ncclCollNet_v6_t ncclCollNet_v5_as_v6; -static ncclCollNet_v4_t *ncclCollNet_v4; +static ncclNet_v6_t *ncclNet_v6; +static ncclCollNet_v7_t ncclCollNet_v5_as_v7; +static ncclCollNet_v7_t ncclCollNet_v6_as_v7; static ncclCollNet_v5_t *ncclCollNet_v5; +static ncclCollNet_v6_t *ncclCollNet_v6; -static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { - ncclNetProperties_v4_t p4; - ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4); +static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* 
props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; - props->name = p4.name; - props->pciPath = p4.pciPath; - props->guid = p4.guid; - props->ptrSupport = p4.ptrSupport; - props->speed = p4.speed; - props->port = p4.port; - props->maxComms = p4.maxComms; - props->maxRecvs = 1; - props->latency = 0; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } -static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { - return ncclNet_v4->isend(sendComm, data, size, mhandle, request); +static ncclResult_t ncclNet_v6_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { + return ncclNet_v6->connect(dev, handle, sendComm); } -static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { - if (n == 0) return ncclSuccess; - if (n != 1) return ncclInvalidArgument; - return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request); +static ncclResult_t ncclNet_v6_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { + return ncclNet_v6->accept(listenComm, recvComm); } -static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { - if (n == 0) return ncclSuccess; - if (n != 1) return ncclInvalidArgument; - return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request); -} - -// We use a wrapper around the v4 init to copy over the struct contents -// post-init 
since they may not be initialized before hand. -static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v4->init(logfn)); - ncclNet_v4_as_v6.name = ncclNet_v4->name; - ncclNet_v4_as_v6.devices = ncclNet_v4->devices; - ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties; - ncclNet_v4_as_v6.listen = ncclNet_v4->listen; - ncclNet_v4_as_v6.connect = ncclNet_v4->connect; - ncclNet_v4_as_v6.accept = ncclNet_v4->accept; - ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr; - ncclNet_v4_as_v6.regMrDmaBuf = NULL; - ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr; - ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend; - ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv; - ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush; - ncclNet_v4_as_v6.test = ncclNet_v4->test; - ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend; - ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv; - ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen; +static ncclResult_t ncclNet_v6_as_v7_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclNet_v6->init(logfn)); + ncclNet_v6_as_v7.name = ncclNet_v6->name; + ncclNet_v6_as_v7.devices = ncclNet_v6->devices; + ncclNet_v6_as_v7.getProperties = ncclNet_v6_as_v7_getProperties; // ncclNet_v5->getProperties; + ncclNet_v6_as_v7.listen = ncclNet_v6->listen; + ncclNet_v6_as_v7.connect = ncclNet_v6_as_v7_connect; + ncclNet_v6_as_v7.accept = ncclNet_v6_as_v7_accept; + ncclNet_v6_as_v7.regMr = ncclNet_v6->regMr; + ncclNet_v6_as_v7.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; + ncclNet_v6_as_v7.deregMr = ncclNet_v6->deregMr; + ncclNet_v6_as_v7.isend = ncclNet_v6->isend; + ncclNet_v6_as_v7.irecv = ncclNet_v6->irecv; + ncclNet_v6_as_v7.iflush = ncclNet_v6->iflush; + ncclNet_v6_as_v7.test = ncclNet_v6->test; + ncclNet_v6_as_v7.closeSend = ncclNet_v6->closeSend; + ncclNet_v6_as_v7.closeRecv = ncclNet_v6->closeRecv; + ncclNet_v6_as_v7.closeListen = ncclNet_v6->closeListen; + ncclNet_v6_as_v7.getDeviceMr = NULL; + ncclNet_v6_as_v7.irecvConsumed = 
NULL; return ncclSuccess; } +static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); + if (ans != ncclSuccess) return ans; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + return ncclSuccess; +} + +static ncclResult_t ncclNet_v5_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { + return ncclNet_v5->connect(dev, handle, sendComm); +} + +static ncclResult_t ncclNet_v5_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { + return ncclNet_v5->accept(listenComm, recvComm); +} + // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. 
-static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v5_as_v7_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v6.name = ncclNet_v5->name; - ncclNet_v5_as_v6.devices = ncclNet_v5->devices; - ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties; - ncclNet_v5_as_v6.listen = ncclNet_v5->listen; - ncclNet_v5_as_v6.connect = ncclNet_v5->connect; - ncclNet_v5_as_v6.accept = ncclNet_v5->accept; - ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr; - ncclNet_v5_as_v6.regMrDmaBuf = NULL; - ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v6.isend = ncclNet_v5->isend; - ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv; - ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v6.test = ncclNet_v5->test; - ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen; + ncclNet_v5_as_v7.name = ncclNet_v5->name; + ncclNet_v5_as_v7.devices = ncclNet_v5->devices; + ncclNet_v5_as_v7.getProperties = ncclNet_v5_as_v7_getProperties; + ncclNet_v5_as_v7.listen = ncclNet_v5->listen; + ncclNet_v5_as_v7.connect = ncclNet_v5_as_v7_connect; + ncclNet_v5_as_v7.accept = ncclNet_v5_as_v7_accept; + ncclNet_v5_as_v7.regMr = ncclNet_v5->regMr; + ncclNet_v5_as_v7.regMrDmaBuf = NULL; + ncclNet_v5_as_v7.deregMr = ncclNet_v5->deregMr; + ncclNet_v5_as_v7.isend = ncclNet_v5->isend; + ncclNet_v5_as_v7.irecv = ncclNet_v5->irecv; + ncclNet_v5_as_v7.iflush = ncclNet_v5->iflush; + ncclNet_v5_as_v7.test = ncclNet_v5->test; + ncclNet_v5_as_v7.closeSend = ncclNet_v5->closeSend; + ncclNet_v5_as_v7.closeRecv = ncclNet_v5->closeRecv; + ncclNet_v5_as_v7.closeListen = ncclNet_v5->closeListen; + ncclNet_v5_as_v7.getDeviceMr = NULL; + ncclNet_v5_as_v7.irecvConsumed = NULL; return ncclSuccess; } -static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { - ncclNetProperties_v4_t p4; - 
ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4); +static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; - props->name = p4.name; - props->pciPath = p4.pciPath; - props->guid = p4.guid; - props->ptrSupport = p4.ptrSupport; - props->speed = p4.speed; - props->port = p4.port; - props->maxComms = p4.maxComms; - props->maxRecvs = 1; - props->latency = 0; - return ncclSuccess; -} - -// We use a wrapper around the v4 init to copy over the struct contents -// post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v4->init(logfn)); - ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; - ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices; - ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties; - ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen; - ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect; - ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport; - ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr; - ncclCollNet_v4_as_v6.regMrDmaBuf = NULL; - ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr; - ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce; - ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush; - ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test; - ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl; - ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = 
NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v5_as_v7_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties; - ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr; - ncclCollNet_v5_as_v6.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce; - ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen; + ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name; + ncclCollNet_v5_as_v7.devices = ncclCollNet_v5->devices; + ncclCollNet_v5_as_v7.getProperties = ncclCollNet_v5_as_v7_getProperties; + ncclCollNet_v5_as_v7.listen = ncclCollNet_v5->listen; + ncclCollNet_v5_as_v7.connect = ncclCollNet_v5->connect; + ncclCollNet_v5_as_v7.reduceSupport = ncclCollNet_v5->reduceSupport; + ncclCollNet_v5_as_v7.regMr = ncclCollNet_v5->regMr; + ncclCollNet_v5_as_v7.regMrDmaBuf = NULL; + ncclCollNet_v5_as_v7.deregMr = ncclCollNet_v5->deregMr; + ncclCollNet_v5_as_v7.iallreduce = ncclCollNet_v5->iallreduce; + ncclCollNet_v5_as_v7.iflush = ncclCollNet_v5->iflush; + ncclCollNet_v5_as_v7.test = ncclCollNet_v5->test; + ncclCollNet_v5_as_v7.closeColl = ncclCollNet_v5->closeColl; + ncclCollNet_v5_as_v7.closeListen = 
ncclCollNet_v5->closeListen; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); + if (ans != ncclSuccess) return ans; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + return ncclSuccess; +} + +// We use a wrapper around the v5 init to copy over the struct contents +// post-init since they may not be initialized before hand. +static ncclResult_t ncclCollNet_v6_as_v7_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v6->init(logfn)); + ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name; + ncclCollNet_v6_as_v7.devices = ncclCollNet_v6->devices; + ncclCollNet_v6_as_v7.getProperties = ncclCollNet_v6_as_v7_getProperties; + ncclCollNet_v6_as_v7.listen = ncclCollNet_v6->listen; + ncclCollNet_v6_as_v7.connect = ncclCollNet_v6->connect; + ncclCollNet_v6_as_v7.reduceSupport = ncclCollNet_v6->reduceSupport; + ncclCollNet_v6_as_v7.regMr = ncclCollNet_v6->regMr; + ncclCollNet_v6_as_v7.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; + ncclCollNet_v6_as_v7.deregMr = ncclCollNet_v6->deregMr; + ncclCollNet_v6_as_v7.iallreduce = ncclCollNet_v6->iallreduce; + ncclCollNet_v6_as_v7.iflush = ncclCollNet_v6->iflush; + ncclCollNet_v6_as_v7.test = ncclCollNet_v6->test; + ncclCollNet_v6_as_v7.closeColl = ncclCollNet_v6->closeColl; + ncclCollNet_v6_as_v7.closeListen = ncclCollNet_v6->closeListen; return ncclSuccess; } @@ -167,7 +215,7 @@ enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, n ncclResult_t ncclNetPluginInit() { char ncclNetPluginName[128]; - const char* envPluginName = 
getenv("NCCL_NET_PLUGIN"); + const char* envPluginName = ncclGetEnv("NCCL_NET_PLUGIN"); if (envPluginName && strlen(envPluginName)) { snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName); INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName); @@ -176,62 +224,97 @@ ncclResult_t ncclNetPluginInit() { } void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); if (netPluginLib == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror()); - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation"); + // dlopen does not guarantee to set errno, but dlerror only gives us a + // string, so checking errno doesn't hurt to try to provide a better + // error message + if (errno == ENOENT) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : dlerror=%s No plugin found (%s), using internal implementation", dlerror(), ncclNetPluginName); + // exit(-1); + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); + } return ncclSuccess; } - ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); + ncclNets[0] = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol."); - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); - if (ncclNet_v4 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5)."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol."); + // Try v6 plugin + ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); + if (ncclNet_v6 == nullptr) { + // Try v5 plugin + ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); + if (ncclNet_v5 == 
nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); if (netPluginLib != nullptr) dlclose(netPluginLib); return ncclSuccess; + } else { + ncclNets[0] = &ncclNet_v5_as_v7; + ncclNet_v5_as_v7.init = ncclNet_v5_as_v7_init; + // Set the name right away to allow for NCCL_NET=... to work + ncclNet_v5_as_v7.name = ncclNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); } - ncclNets[0] = &ncclNet_v4_as_v6; - ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v4_as_v6.name = ncclNet_v4->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name); } else { - ncclNets[0] = &ncclNet_v5_as_v6; - ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init; + ncclNets[0] = &ncclNet_v6_as_v7; + ncclNet_v6_as_v7.init = ncclNet_v6_as_v7_init; // Set the name right away to allow for NCCL_NET=... 
to work - ncclNet_v5_as_v6.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); + ncclNet_v6_as_v7.name = ncclNet_v6->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); } } // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); + ncclCollNets[0] = (ncclCollNet_v7_t*) dlsym(netPluginLib, "ncclCollNetPlugin_v7"); if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol."); - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); - if (ncclCollNet_v4 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5)."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol."); + ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); + if (ncclCollNet_v6 == nullptr) { + ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); + if (ncclCollNet_v5 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); } else { - ncclCollNets[0] = &ncclCollNet_v4_as_v6; - ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init; - ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v5_as_v7; + ncclCollNet_v5_as_v7.init = ncclCollNet_v5_as_v7_init; + ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name); } } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v6; - ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init; - ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v6_as_v7; + ncclCollNet_v6_as_v7.init = ncclCollNet_v6_as_v7_init; + ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name); } } return ncclSuccess; } +ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { + ncclNetProperties_t props; + + NCCLCHECK(net->getProperties(dev, &props)); + ncclNetDeviceType type = props.netDeviceType; + if (type) switch (type) { + case NCCL_NET_DEVICE_UNPACK: + if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { + INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", + props.netDeviceVersion); + return ncclSuccess; + } else { + WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", + props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); + return ncclInternalError; + } + default: + WARN("Unknown device code index"); + return ncclInternalError; + } + + INFO(NCCL_INIT, "Using non-device net plugin version %d", + props.netDeviceVersion); + return ncclSuccess; +} + static ncclResult_t netGetState(int 
i, enum ncclNetState* state) { pthread_mutex_lock(&netLock); if (ncclNetStates[i] == ncclNetStateInit) { @@ -268,6 +351,10 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { NCCLCHECK(netGetState(i, &state)); if (state != ncclNetStateEnabled) continue; if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { + // Mismatched device plugin version + continue; + } comm->ncclNet = ncclNets[i]; ok = true; @@ -334,10 +421,10 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { } if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2); + NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2); if (rComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2); + NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); connected = (rComm != NULL) && (sComm != NULL); } @@ -366,5 +453,11 @@ cleanup1: } int ncclNetVersion(struct ncclComm* comm) { - return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 
5 : 6); + if (comm->ncclNet == &ncclNet_v5_as_v7) { + return 5; + } else if (comm->ncclNet == &ncclNet_v6_as_v7) { + return 6; + } else { + return 7; + } } diff --git a/src/proxy.cc b/src/proxy.cc index 9756c93dbb..976b1d3ba5 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -15,6 +15,16 @@ #include #include +#include +#include + +#define PROGRESS_RUNNING 0 +#define PROGRESS_REQUEST_STOP 1 +#define PROGRESS_ABORT 2 +#define PROGRESS_COMPLETE 3 + +#define SERVICE_RUNNING 0 +#define SERVICE_COMPLETE 1 enum { proxyRecv=0, proxySend=1 }; @@ -50,7 +60,7 @@ static void expectedProxyResponseFree(struct ncclProxyState* state) { } } -static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) { +static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize, ncclResult_t res) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; while (elem) { if (elem->opId == opId) { @@ -67,6 +77,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi memcpy(elem->respBuff, respBuff, respSize); free(respBuff); elem->done = true; + elem->res = res; return ncclSuccess; } elem = elem->next; @@ -84,6 +95,7 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v // Pre-alloc response buffer ex->respBuff = malloc(respSize); ex->respSize = respSize; + ex->res = ncclInternalError; ex->done = false; // Enqueue @@ -109,10 +121,11 @@ static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, v prev->next = elem->next; } memcpy(respBuff, elem->respBuff, elem->respSize); + ncclResult_t res = elem->res; free(elem->respBuff); free(elem); *found = 1; - return ncclSuccess; + return res; } prev = elem; elem = elem->next; @@ -509,7 +522,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel type == proxyRecv ? 
"recv" : "send", peer, channel->id, connIndex); return ncclInternalError; } - if (connector->transportComm->proxyProgress == NULL) return ncclSuccess; + if (connector->proxyConn.proxyProgress == NULL) return ncclSuccess; if (justInquire) *justInquire = true; else { @@ -707,13 +720,13 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int if (state->active == NULL) { pthread_mutex_lock(&pool->mutex); - while (pool->nextOps == -1 && !state->stop) { + while (pool->nextOps == -1 && state->stop == PROGRESS_RUNNING) { struct ncclProxyArgs profArgs; // Only used for profiling purposes ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep); pthread_cond_wait(&pool->cond, &pool->mutex); ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup); } - if (state->stop) { // We might have been woken up to stop. + if (state->stop != PROGRESS_RUNNING) { // We might have been woken up to stop. pthread_mutex_unlock(&pool->mutex); return ncclSuccess; } @@ -851,12 +864,13 @@ void* ncclProxyProgress(void *proxyState_) { * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. 
*/ int proxyOpAppendCounter = 0; struct ncclProxyArgs profArgs; // Only used for profiling purposes - while ((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) { + while (state->stop == PROGRESS_RUNNING || (state->stop == PROGRESS_REQUEST_STOP && state->active)) { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { - INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); - return NULL; + __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); + INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); + continue; } if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle); if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive); @@ -864,11 +878,12 @@ void* ncclProxyProgress(void *proxyState_) { int added = 0; proxyOpAppendCounter = 0; TIME_START(3); - if (state->stop == false) + if (state->stop == PROGRESS_RUNNING) ret = ncclProxyGetPostedOps(proxyState, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { - INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); + __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); + INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); } if (added == 0) { sched_yield(); // No request progressed. Let others run. @@ -876,6 +891,9 @@ void* ncclProxyProgress(void *proxyState_) { } lastIdle = idle; } + + /* progress serive thread should be waiting for me, I need to notify it. 
*/ + __atomic_store_n(&state->stop, PROGRESS_COMPLETE, __ATOMIC_RELEASE); return NULL; } @@ -898,7 +916,11 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) { static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (!state->thread) { - pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState); + pthread_attr_t attr; + SYSCHECK(pthread_attr_init(&attr), "pthread_attr_init"); + SYSCHECK(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED), "pthread_attr_setdetachstate"); + SYSCHECK(pthread_create(&state->thread, &attr, ncclProxyProgress, proxyState), "pthread_create"); + SYSCHECK(pthread_attr_destroy(&attr), "pthread_attr_destroy"); ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks); } return ncclSuccess; @@ -910,10 +932,17 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) { // Request the proxy to stop and then wake it if (state->opsPool) { pthread_mutex_lock(&state->opsPool->mutex); - state->stop = true; + if (*proxyState->abortFlag == 0) + state->stop = PROGRESS_REQUEST_STOP; + else + state->stop = PROGRESS_ABORT; pthread_cond_signal(&state->opsPool->cond); pthread_mutex_unlock(&state->opsPool->mutex); - pthread_join(state->thread, NULL); + /* progress thread is always detached, wait for it to exit. */ + uint64_t t0 = clockNano(); + while (__atomic_load_n(&state->stop, __ATOMIC_ACQUIRE) != PROGRESS_COMPLETE) { + if (clockNano() - t0 >= 1000) sched_yield(); + } } // Free off any memory allocated for the proxy arg pools @@ -1005,7 +1034,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in int ready, proxyRank = -1; struct ncclProxyState* sharedProxyState = comm->proxyState; - // Keep one connection per mlocal rank + // Keep one connection per local rank for (int i = 0; i < comm->localRanks; ++i) { /* find the proxy rank in comm. 
*/ if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) { @@ -1058,42 +1087,43 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } - INFO(NCCL_NET|NCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection); + INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection); return ncclSuccess; } // cuMem API support // The response is sent out-of-band using ncclIpcSocket for this specific command -ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd) { +ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd) { ncclResult_t ret = ncclSuccess; ncclResult_t res = ncclInProgress; struct ncclIpcSocket ipcSock = { 0 }; - void* opId = malloc(1); + void *opId = (void*)((((uintptr_t)random()) << 32) | random()); + // Create a UDS socket to receive the converted fd NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag)); - // Request the conversion of the fd over sockets - NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error); + // Request the allocation of a UDS fd for the handle over sockets + NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), 0, opId), ret, error); - // Receive converted fd over UDS - NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, convertedFd)); - TRACE(NCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd); - NCCLCHECK(ncclIpcSocketClose(&ipcSock)); + // Receive the converted fd over UDS + NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, 
error); + TRACE(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx rank %d returned fd %d", *(uint64_t*)handle, proxyConn->tpLocalRank, *convertedFd); + NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error); + // Wait for proxy response (sockets) while (res == ncclInProgress) { res = ncclPollProxyResponse(comm, proxyConn, NULL, opId); } - free(opId); - return res; + return ret; error: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); - WARN("ncclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank); + WARN("ncclProxyClientGetFd call to rank %d handle 0x%lx failed : %d", proxyConn->tpRank, *(uint64_t*)handle, ret); return ret; } -const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" }; +const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; @@ -1132,14 +1162,13 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec // Check response queue int found = 0; - NCCLCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found)); + ncclResult_t res = expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found); if (found == 0) { // Attempt to read in a new response header from the proxy thread struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; - - void* recvOpId; + ncclProxyRpcResponseHeader resp = {0}; int offset = 0; - if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) { + if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset)) { WARN("Socket recv failed while polling for opId=%p", opId); return ncclInternalError; } @@ 
-1147,42 +1176,38 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec if (offset == 0) { return ncclInProgress; // If we've returned a partial response, block to receive the rest of it - } else if (offset < sizeof(recvOpId)) { - while (offset < sizeof(recvOpId)) - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)); + } else if (offset < sizeof(resp)) { + while (offset < sizeof(resp)) + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset)); } - INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", recvOpId); - - // Now do a blocking recv of the response size - int respSize = 0; - NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize))); + INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", resp.opId); // If there's a respSize to recv - if (respSize > 0) { - if (recvOpId != opId) { + if (resp.respSize > 0) { + if (resp.opId != opId) { // Unexpected response, need to buffer the socket data - respBuff = malloc(respSize); + respBuff = malloc(resp.respSize); } assert(respBuff != NULL); - NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize)); + NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize)); } - if (recvOpId == opId) { - INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId); - NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId)); - return ncclSuccess; + if (resp.opId == opId) { + INFO(NCCL_PROXY, "resp.opId=%p matches expected opId=%p", resp.opId, opId); + NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, resp.opId)); + return resp.res; } else { - INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize); + INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", resp.opId, respBuff, resp.respSize); // Store the result and mark response as completed - NCCLCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize)); + 
NCCLCHECK(expectedProxyResponseStore(sharedProxyState, resp.opId, respBuff, resp.respSize, resp.res)); return ncclInProgress; } } else { INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId); } - return ncclSuccess; + return res; } ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { @@ -1284,38 +1309,52 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr } // cuMem API support -static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, int fd) { +static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, uint64_t handle) { +#if CUDART_VERSION >= 11030 + // cuMem API support + ncclResult_t ret = ncclSuccess; struct ncclIpcSocket ipcSock = { 0 }; uint64_t hash = (uint64_t) opId; + INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, peer->tpLocalRank, hash); - INFO(NCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash); + CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + int fd = -1; + + CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0)); // Send back the converted fd using UDS - NCCLCHECK(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag)); - NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash)); + NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error); + NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash), ret, error); +error: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); - return ncclSuccess; + // We can now safely close the exported fd + (void) close(fd); + return ret; +#else + return ncclInternalError; +#endif } static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, 
struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) { int done = 1; + ncclResult_t res = ncclInternalError; if (op->type == ncclProxyMsgSetup) { TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId); - NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + res = op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done); } else if (op->type == ncclProxyMsgConnect) { TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff); - NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + res = op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done); } else if (op->type == ncclProxyMsgSharedInit) { int nChannels = (int) *op->reqBuff; TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels); - if (op->connection->tcomm->proxySharedInit) NCCLCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels)); + if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels); __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE); - } else if (op->type == ncclProxyMsgConvertFd) { - int fd = *(int *)op->reqBuff; - TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd); - NCCLCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support + } else if (op->type == ncclProxyMsgGetFd) { + uint64_t handle = *(uint64_t*)op->reqBuff; + TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgGetFd 
opId=%p op.reqBuff=%p handle=0x%lx", op->opId, op->reqBuff, handle); + res = proxyGetFd(peer, op->opId, proxyState, handle); // cuMem API support } else if (op->type == ncclProxyMsgInit) { TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff); - NCCLCHECK(proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection)); + res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection); } else return ncclInternalError; if (done) { @@ -1329,11 +1368,10 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP * to abort and close the connection, it can cause segfault if the requester is using * the respBuff. */ - // Send the opId for referencing async operation - NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId))); + ncclProxyRpcResponseHeader resp = {op->opId, res, op->respSize}; - // Send the response size - NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize))); + // Send the opId for referencing async operation + NCCLCHECK(ncclSocketSend(op->connection->sock, &resp, sizeof(resp))); if (op->respSize) { // Send the response @@ -1386,7 +1424,7 @@ static bool proxyMatchOpType(int type) { case ncclProxyMsgSharedInit: case ncclProxyMsgSetup: case ncclProxyMsgConnect: - case ncclProxyMsgConvertFd: + case ncclProxyMsgGetFd: return true; default: return false; @@ -1544,6 +1582,19 @@ void* ncclProxyService(void* _args) { ncclSocketClose(proxyState->listenSock); free(proxyState->listenSock); proxyOpsFree(proxyState); + + if (*proxyState->abortFlag) { + /* abort happened, need to notify main thread I am done. 
*/ + __atomic_store_n(&proxyState->stop, SERVICE_COMPLETE, __ATOMIC_RELEASE); + } + + if (ncclAtomicRefCountDecrement(proxyState->abortFlagRefCount) == 0) { + ncclCudaHostFree((void *)proxyState->abortFlag); + free((void*)proxyState->abortFlagRefCount); + } + + /* proxy itself holds one internal ref count, needs to call ncclProxyDestroy */ + ncclProxyDestroy(proxyState); return NULL; } @@ -1552,8 +1603,16 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1)); comm->proxyState = comm->sharedRes->proxyState; comm->proxyState->refCount = 1; + /* ref count for communicator and proxy service thread. */ + comm->proxyState->internalRefCount = 2; comm->proxyState->listenSock = sock; comm->proxyState->peerAddresses = peerAddresses; + // Seed the random number generator for UDS filename generation + struct timeval time; + gettimeofday(&time,NULL); + unsigned int seed = time.tv_sec*time.tv_usec; + seed ^= getpid(); + srandom(seed); return ncclSuccess; } @@ -1568,6 +1627,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { proxyState->tpLocalnRanks = comm->localRanks; proxyState->cudaDev = comm->cudaDev; proxyState->abortFlag = comm->abortFlag; + proxyState->abortFlagRefCount = comm->abortFlagRefCount; + ncclAtomicRefCountIncrement(comm->abortFlagRefCount); proxyState->p2pnChannels = comm->p2pnChannels; proxyState->p2pChunkSize = comm->p2pChunkSize; proxyState->nChannels = comm->nChannels; @@ -1584,8 +1645,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { } ncclResult_t ncclProxyStop(struct ncclComm* comm) { - if (comm->sharedRes && comm->sharedRes->proxyState) { - struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; + if (comm->proxyState) { + struct ncclProxyState* sharedProxyState = comm->proxyState; if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (sharedProxyState->peerAddresses) { @@ -1625,15 +1686,41 @@ 
ncclResult_t ncclProxyStop(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { - struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; - - assert(sharedProxyState->refCount == 0); - free(sharedProxyState->peerAddresses); - free(sharedProxyState->peerSocks); - free(sharedProxyState->proxyOps); - free(sharedProxyState->sharedDevMems); - expectedProxyResponseFree(sharedProxyState); - free(sharedProxyState); +ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState) { + if (__atomic_sub_fetch(&proxyState->internalRefCount, 1, __ATOMIC_ACQ_REL) == 0) { + free(proxyState->peerAddresses); + free(proxyState->peerSocks); + free(proxyState->proxyOps); + free(proxyState->sharedDevMems); + expectedProxyResponseFree(proxyState); + free(proxyState); + } + return ncclSuccess; +} + +/* detach all proxy threads in case of abort */ +ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState) { + if (proxyState && proxyState->thread) { + /* proxy service thread can call cudaFreeHost to free pinned host mem, but + * it can cause a hang if main thread is issuing other cuda calls. To solution + * should be allocate/free pinned host mem using cuMem* driver API, this waiting + * 5 secs is just a workaround for now. */ + bool join = false; + struct timespec start, now; + clock_gettime(CLOCK_MONOTONIC, &start); + do { + clock_gettime(CLOCK_MONOTONIC, &now); + if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) == SERVICE_COMPLETE) { + /* proxy thread is done, join it. 
*/ + pthread_join(proxyState->thread, NULL); + join = true; + break; + } + } while(now.tv_sec - start.tv_sec < 5); + + if (join == false) { + pthread_detach(proxyState->thread); + } + } return ncclSuccess; } diff --git a/src/transport.cc b/src/transport.cc index 9817beb183..c66a81ed7f 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -178,10 +178,32 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } } - // Clear all connect masks and free each connectInfo array - for (int i=1; inRanks; i++) { + /* We need to sync ranks here since some ranks might run too fast after connection setup + * and start to destroy the connection after returning from this function; however, the + * others might still be trying to connect and import the buffer. No sync can lead to invalid + * shmem/cuda buffer. In addition, we also clear all connect masks and free each connectInfo array */ + for (int i = 1; i < comm->nRanks; i++) { + int bootstrapTag = (i << 8) + (graph ? graph->id + 1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; + int flag = 0; + + if (recvPeer != sendPeer) { + if (comm->connectSend[sendPeer] != 0UL) + NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + if (comm->connectRecv[recvPeer] != 0UL) + NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + + if (comm->connectSend[sendPeer] != 0UL) + NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + if (comm->connectRecv[recvPeer] != 0UL) + NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + } else { + if (comm->connectSend[sendPeer] != 0UL || comm->connectRecv[recvPeer] != 0UL) { + NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + 
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); + } + } comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL; free(data[i]); } diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index f66abe8b41..04bab8b4f2 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -155,7 +155,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank)); + send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn)); ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); @@ -177,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph // Determine whether we need to flush the GDR buffer on recv or not if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank)); + recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn)); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; @@ -224,6 +224,8 @@ struct collNetConnectArgs { struct ncclConnect* connectInfos; }; +static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); + static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { // We're on the same process as the proxy. 
We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; @@ -247,9 +249,14 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + + send->proxyConn.proxyProgress = sendProxyProgress; + return ncclSuccess; } +static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); + static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { // We're on the same process as the proxy. We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; @@ -272,6 +279,9 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); } + + recv->proxyConn.proxyProgress = recvProxyProgress; + return ncclSuccess; } diff --git a/src/transport/net.cc b/src/transport/net.cc index 273d5d5e60..0998172f59 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -80,7 +80,7 @@ struct connectMap { } offsets; }; -struct sendResources { +struct sendNetResources { struct connectMap map; void* netSendComm; struct ncclSendMem* sendMem; @@ -103,9 +103,12 @@ struct sendResources { void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; + int netDeviceVersion; + ncclNetDeviceType netDeviceType; + ncclNetDeviceHandle_t* netDeviceHandle; }; -struct recvResources { +struct recvNetResources { struct connectMap map; void* netListenComm; void* netRecvComm; @@ -132,6 +135,9 @@ struct recvResources { void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; + int netDeviceVersion; + ncclNetDeviceType netDeviceType; + ncclNetDeviceHandle_t* netDeviceHandle; }; /* Determine if two peers can communicate with NET */ @@ -159,11 +165,14 @@ struct setupReq { int connIndex; }; 
+// Forward declaration +static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); + /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct setupReq req = { 0 }; - int localRank, tpProxyRank; + int tpProxyRank; send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; req.channelId = channelId; @@ -176,8 +185,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph tpProxyRank = comm->topParentRanks[proxyRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn)); - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank)); - req.tpLocalRank = comm->topParentLocalRanks[localRank]; + req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); @@ -201,7 +209,6 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); /* Setup recv connector */ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct setupReq req = { 0 }; - int localRank; recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? 
ncclParamNetSharedBuffers() : 1; req.channelId = channelId; @@ -219,8 +226,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn)); - NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank)); - req.tpLocalRank = comm->topParentLocalRanks[localRank]; + req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); @@ -267,6 +273,14 @@ static ncclResult_t netDumpMap(struct connectMap* map) { return ncclSuccess; } +struct netSendConnectArgs { + ncclNetHandle_t handle; +}; + +struct netRecvConnectArgs { + int proxyRank; +}; + static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct connectMap* map = (connectMap*) send->transportResources; @@ -279,7 +293,9 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne send->transportResources = map; opId = send; INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); - NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId)); + netSendConnectArgs args = {0}; + memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); + NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; } @@ -293,15 +309,13 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne if (map->sameProcess && !ncclCuMemEnable()) { if (map->cudaDev != comm->cudaDev) { - if (!ncclCuMemEnable()) { - // Enable 
P2P access for Legacy IPC - cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0); - if (err == cudaErrorPeerAccessAlreadyEnabled) { - cudaGetLastError(); - } else if (err != cudaSuccess) { - WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err)); - return ncclInternalError; - } + // Enable P2P access for Legacy IPC + cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0); + if (err == cudaErrorPeerAccessAlreadyEnabled) { + cudaGetLastError(); + } else if (err != cudaSuccess) { + WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err)); + return ncclInternalError; } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { @@ -339,9 +353,30 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + + if (send->proxyConn.sameProcess) { + if (send->proxyConn.connection->netDeviceHandle) { + send->conn.netDeviceHandle = *send->proxyConn.connection->netDeviceHandle; + + for (int p=0; pconn.mhandles[p] = send->proxyConn.connection->mhandles[p]; + } + + if (send->proxyConn.connection->needsProxyProgress) { + send->proxyConn.proxyProgress = sendProxyProgress; + } else { + send->proxyConn.proxyProgress = NULL; + } + } else { + send->proxyConn.proxyProgress = sendProxyProgress; + } + return ncclSuccess; } +// Forward declare +static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); + /* Connect to this peer */ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct connectMap* map = (connectMap*) recv->transportResources; @@ -353,7 +388,9 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne opId = recv; INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p", opId, 
&recv->proxyConn, connectInfo); - NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId)); + netRecvConnectArgs args = {0}; + args.proxyRank = *((int*)connectInfo); + NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(netRecvConnectArgs), sizeof(struct connectMap), opId)); } else { opId = recv; } @@ -378,6 +415,24 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + + if (recv->proxyConn.sameProcess) { + if (recv->proxyConn.connection->netDeviceHandle) { + recv->conn.netDeviceHandle = *recv->proxyConn.connection->netDeviceHandle; + + for (int p=0; pconn.mhandles[p] = recv->proxyConn.connection->mhandles[p]; + } + + if (recv->proxyConn.connection->needsProxyProgress) { + recv->proxyConn.proxyProgress = recvProxyProgress; + } else { + recv->proxyConn.proxyProgress = NULL; + } + } else { + recv->proxyConn.proxyProgress = recvProxyProgress; + } + return ncclSuccess; } @@ -416,7 +471,7 @@ static ncclResult_t recvFree(struct ncclConnector* recv) { } #define NCCL_SHARED_STEPS 16 -static ncclResult_t sharedBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess, +static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess, int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) { if (cuda == 0 && sameProcess == 0) { WARN("PXN should not use host buffers for data"); @@ -462,7 +517,7 @@ static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int chan return ncclSuccess; } -static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) { +static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int 
tpLocalRank, int type, struct ncclProxyConnection* connection) { if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank]; if (peer == NULL) NCCLCHECK(ncclInternalError;) @@ -492,7 +547,7 @@ static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int } static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) { - NCCLCHECK(sharedBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL)); + NCCLCHECK(sharedNetBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL)); return ncclSuccess; } @@ -500,7 +555,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; - struct sendResources* resources; + struct sendNetResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; @@ -517,6 +572,11 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; + resources->netDeviceVersion = props.netDeviceVersion; + resources->netDeviceType = props.netDeviceType; + + resources->netDeviceVersion = props.netDeviceVersion; + resources->netDeviceType = props.netDeviceType; // We don't return any data if (respSize != 0) return ncclInternalError; @@ -528,7 +588,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; - struct recvResources* resources; + struct 
recvNetResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; @@ -546,6 +606,8 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; + resources->netDeviceVersion = props.netDeviceVersion; + resources->netDeviceType = props.netDeviceType; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); @@ -554,11 +616,34 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc return ncclSuccess; } -static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { - struct sendResources* resources = (struct sendResources*)(connection->transportResources); - if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; - ncclResult_t ret = ncclSuccess; +// This function embeds plugin-specific rules given the current versions +static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, bool isRecv, ncclNetDeviceHandle_t** handle) { + bool needsDeviceHandle = false; + if (type == NCCL_NET_DEVICE_UNPACK) { + if (version == NCCL_NET_DEVICE_UNPACK_VERSION && isRecv) { + needsDeviceHandle = true; + } + } + + // Don't re-alloc netDeviceHandles + if (needsDeviceHandle && (*handle == NULL)) { + NCCLCHECK(ncclCalloc(handle, 1)); + (*handle)->netDeviceType = type; + (*handle)->netDeviceVersion = version; + } else if (!needsDeviceHandle) { + *handle = NULL; + } + + return ncclSuccess; +} + +static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + 
struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; + ncclResult_t ret = ncclSuccess; + netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; + NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle)); if (resources->shared) { // Shared buffers struct ncclProxyProgressState* progressState = &proxyState->progressState; @@ -577,15 +662,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, comms->sendComm + resources->channelId); + if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm); + ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm); + ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -596,6 +681,13 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } *done = 1; + if (resources->netDeviceHandle) 
{ + connection->netDeviceHandle = resources->netDeviceHandle; + connection->needsProxyProgress = connection->netDeviceHandle->needsProxyProgress; + } else { + connection->needsProxyProgress = 1; + } + // Create structures struct connectMap* map = &resources->map; map->sameProcess = connection->sameProcess; @@ -611,7 +703,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str // Get shared buffers int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; - NCCLCHECK(sharedBuffersInit( + NCCLCHECK(sharedNetBuffersInit( proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; @@ -679,6 +771,10 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str { NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } + + // Copy the mhandle dptr, if implemented + if (resources->netDeviceHandle && proxyState->ncclNet->getDeviceMr) + NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netSendComm, resources->mhandles[p], &connection->mhandles[p])); } } @@ -689,11 +785,13 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { - if (reqSize != sizeof(int)) return ncclInternalError; - struct recvResources* resources = (struct recvResources*)(connection->transportResources); - resources->tpRemoteProxyRank = *(int*)reqBuff; + if (reqSize != sizeof(netRecvConnectArgs)) return ncclInternalError; + struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); + netRecvConnectArgs* req = (netRecvConnectArgs*) reqBuff; + resources->tpRemoteProxyRank = req->proxyRank; ncclResult_t ret = ncclSuccess; + NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, true /*isRecv*/, &resources->netDeviceHandle)); // Finish connection establishment from remote peer if (resources->shared) { // Shared buffers @@ -713,15 +811,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank; - if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId); + if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle); resources->netRecvComm = 
comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm); + ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm); + ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } @@ -732,6 +830,13 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str } *done = 1; + if (resources->netDeviceHandle) { + connection->netDeviceHandle = resources->netDeviceHandle; + connection->needsProxyProgress = connection->netDeviceHandle->needsProxyProgress; + } else { + connection->needsProxyProgress = 1; + } + NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm)); // Create structures @@ -749,7 +854,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str // Get shared buffers int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; - NCCLCHECK(sharedBuffersInit( + NCCLCHECK(sharedNetBuffersInit( proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; @@ -809,6 +914,10 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str { NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } + + // Copy the mhandle dptr + if (resources->netDeviceType != NCCL_NET_DEVICE_HOST && proxyState->ncclNet->getDeviceMr) + NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netRecvComm, resources->mhandles[p], &connection->mhandles[p])); } } @@ -819,9 +928,9 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { - struct sendResources* resources = (struct sendResources*)(connection->transportResources); + struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); if (connection->state == connSharedInitialized) { // NVB Preconnect - NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection)); + NCCLCHECK(sharedNetBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection)); return ncclSuccess; } @@ -846,7 +955,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { - NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection)); + NCCLCHECK(sharedNetBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank; comms->sendRefCount[resources->channelId]--; @@ -864,9 +973,9 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct } static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { - struct recvResources* resources = (struct recvResources*)(connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); if 
(connection->state == connSharedInitialized) { // NVB Preconnect - NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection)); + NCCLCHECK(sharedNetBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection)); return ncclSuccess; } @@ -887,7 +996,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { - NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection)); + NCCLCHECK(sharedNetBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank; comms->recvRefCount[resources->channelId]--; @@ -910,7 +1019,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); + struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; @@ -925,7 +1034,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; if (sub->done == sub->nsteps) continue; - struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); + struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources); void* mhandle = resources->mhandles[p]; int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = 
NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); @@ -1044,7 +1153,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } else if (s>0) { // Find next sub with the same recvComm int next; for (next=s; nextnsubs; next++) { - struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources); + struct recvNetResources* nextRes = (struct recvNetResources*) (args->subs[next].connection->transportResources); if (nextRes->netRecvComm == recvComm) break; } if (next == args->nsubs) { // Not found @@ -1057,7 +1166,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } } groupSize++; - struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); maxRecvs = resources->maxRecvs; recvComm = resources->netRecvComm; // Round to next multiple of sliceSteps @@ -1084,7 +1193,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* sub = subGroup + i; if (sub->posted < sub->nsteps) { if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } - struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; @@ -1107,10 +1216,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } if (subCount) { uint64_t step = subGroup->posted; - struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) 
(subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { + subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; + subGroup->recvRequestsSubCount = subCount; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; sub->posted += args->sliceSteps; @@ -1141,14 +1252,14 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct sub->received += args->sliceSteps; for (uint64_t step=sub->received-args->sliceSteps; stepreceived; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); if (step < sub->nsteps) { - struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (resources->useGdr) needFlush |= resources->needFlush; } } subGroup->requests[step%NCCL_STEPS] = NULL; if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && needFlush) { // GDRCOPY support - struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); if (resources->gdcFlush) { #if defined (__x86_64__) // Force a PCI-E read from GPU memory @@ -1162,7 +1273,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (step < sub->nsteps) { - struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int 
buffSlot = (sub->base+sub->posted)%NCCL_STEPS; @@ -1171,7 +1282,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct subCount++; } } - struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } } @@ -1195,7 +1306,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); - struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail; *recvTail = sub->base + sub->transmitted; if (resources->gdcSync) wc_store_fence(); // Flush out WC write @@ -1213,17 +1324,23 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* sub = subGroup + i; if (sub->done == sub->nsteps) continue; if (sub->transmitted > sub->done) { - struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* sendHead = &resources->sendMem->head; uint64_t done = *sendHead; while (done > sub->base + sub->done && // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. 
sub->transmitted > sub->done) { + if (subGroup->recvRequestsCache[sub->done%NCCL_STEPS]) { + // the multirecv requests are only cached in the first sub. + if (proxyState->ncclNet->irecvConsumed) + NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS])); + subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; + } sub->done += args->sliceSteps; for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd); args->idle = 0; if (sub->done == sub->nsteps) { - struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); resources->step = sub->base + sub->nsteps; args->done++; break; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 421f0a13a1..8d4313dddc 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -333,6 +333,8 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort; props->maxComms = ncclIbDevs[dev].maxQp; props->maxRecvs = NCCL_NET_IB_MAX_RECVS; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } @@ -348,6 +350,10 @@ struct ncclIbQpInfo { uint8_t link_layer; uint32_t qpn[NCCL_IB_MAX_QPS]; + // Fields needed for ece (enhanced connection establishment) + struct ibv_ece ece[NCCL_IB_MAX_QPS]; + int ece_supported[NCCL_IB_MAX_QPS]; + // For RoCE uint64_t spn; uint64_t iid; @@ -608,7 +614,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { return ncclSuccess; } -ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { +ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { 
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; @@ -652,7 +658,13 @@ ib_connect_check: NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); struct ncclIbQpInfo qpInfo; qpInfo.ib_port = ib_port; - for (int q=0; qnqps; q++) qpInfo.qpn[q] = comm->qps[q]->qp_num; + for (int q=0; qnqps; q++) { + qpInfo.qpn[q] = comm->qps[q]->qp_num; + + // Query ece capabilities (enhanced connection establishment) + NCCLCHECK(wrap_ibv_query_ece(comm->qps[q], &qpInfo.ece[q], &qpInfo.ece_supported[q])); + } + qpInfo.mtu = portAttr.active_mtu; // Prepare my fifo @@ -663,15 +675,20 @@ ib_connect_check: // RoCE support qpInfo.lid = portAttr.lid; qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer; - if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB - for (int q=0; qnqps; q++) - INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid); - } else { // RoCE - NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid)); + if (qpInfo.link_layer == IBV_LINK_LAYER_ETHERNET) { + NCCLCHECK(wrap_ibv_query_gid(ncclIbDevs[dev].context, ncclIbDevs[dev].port, ncclParamIbGidIndex(), &comm->gidInfo.localGid)); qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix; qpInfo.iid = comm->gidInfo.localGid.global.interface_id; + } + + if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q=0; qnqps; q++) - INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); + INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ncclIbDevs[dev].port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid); + } else { // RoCE + for (int q=0; qnqps; q++) + INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, 
comp_mask=0x%x} GID %ld (%lX/%lX)", + dev, ncclIbDevs[dev].port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.ece_supported[q], qpInfo.ece[q].vendor_id, qpInfo.ece[q].options, qpInfo.ece[q].comp_mask, ncclParamIbGidIndex(), + qpInfo.spn, qpInfo.iid); } stage->state = ncclIbCommStateSend; @@ -699,10 +716,19 @@ ib_connect: comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid; for (int q=0; qnqps; q++) { struct ibv_qp* qp = comm->qps[q]; + if (remQpInfo.ece_supported[q] && qpInfo.ece_supported[q]) + NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo.ece[q], &qpInfo.ece_supported[q])); + NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo)); NCCLCHECK(ncclIbRtsQp(qp)); } + if (qpInfo.link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE + for (int q=0; qnqps; q++) + INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", + dev, ncclIbDevs[dev].port, qpInfo.qpn[q], remQpInfo.ece_supported[q], remQpInfo.ece[q].vendor_id, remQpInfo.ece[q].options, remQpInfo.ece[q].comp_mask); + } + comm->ready = 1; stage->state = ncclIbCommStateConnected; stage->offset = 0; @@ -720,7 +746,7 @@ ib_send_ready: NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); -ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { +ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; @@ -781,8 +807,21 @@ ib_recv: remQpInfo.mtu = (enum ibv_mtu)std::min(remQpInfo.mtu, portAttr.active_mtu); // Setup QP + struct ncclIbQpInfo qpInfo; for (int q=0; qnqps; q++) { struct ibv_qp* qp = rComm->qps[q]; + + // Set the ece (enhanced connection establishment) on this QP before RTR + if (remQpInfo.ece_supported[q]) { + NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo.ece[q], &qpInfo.ece_supported[q])); + + // Query the reduced ece for this 
QP (matching enhancements between the requestor and the responder) + // Store this in our own qpInfo for returning to the requestor + if (qpInfo.ece_supported[q]) { + NCCLCHECK(wrap_ibv_query_ece(qp, &qpInfo.ece[q], &qpInfo.ece_supported[q])); + } + } + NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo)); NCCLCHECK(ncclIbRtsQp(qp)); } @@ -815,7 +854,6 @@ ib_recv: } // Fill Handle - struct ncclIbQpInfo qpInfo; qpInfo.lid=portAttr.lid; qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer; qpInfo.ib_port=ib_port; @@ -1380,6 +1418,8 @@ ncclNet_t ncclNetIb = { ncclIbTest, ncclIbCloseSend, ncclIbCloseRecv, - ncclIbCloseListen + ncclIbCloseListen, + NULL /* getDeviceMr */, + NULL /* irecvConsumed */ }; diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 08a8c3a293..502179a217 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -101,6 +101,8 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->port = 0; props->maxComms = 65536; props->maxRecvs = 1; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; return ncclSuccess; } @@ -301,7 +303,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) return ncclSuccess; } -ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm) { +ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } @@ -346,7 +348,7 @@ socket_send: return ncclSuccess; } -ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm) { +ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { struct ncclNetSocketListenComm* lComm = (struct ncclNetSocketListenComm*)listenComm; struct ncclNetSocketCommStage* stage = 
&lComm->stage; struct ncclNetSocketComm* rComm = stage->comm; @@ -609,5 +611,7 @@ ncclNet_t ncclNetSocket = { ncclNetSocketTest, ncclNetSocketClose, ncclNetSocketClose, - ncclNetSocketCloseListen + ncclNetSocketCloseListen, + NULL /* getDeviceMr */, + NULL /* irecvConsumed */ }; diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 07be99d9fe..c9a3bbc289 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -10,17 +10,30 @@ #include "graph.h" #include "utils.h" #include "proxy.h" +#include "enqueue.h" #if CUDART_VERSION >= 12010 -// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange -#define USE_POSIX_FD 1 +struct graphRegData { + uintptr_t offset; + size_t size; +}; -#if USE_POSIX_FD -#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR -#else -#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE -#endif +struct localRegData { + /* Registration record data */ + uintptr_t recSendbuff, recRecvbuff; + intptr_t recSendOffset, recRecvOffset; + /* Registration request data */ + uintptr_t reqSendbuff, reqRecvbuff; + size_t reqSendSize, reqRecvSize; + intptr_t reqSendOffset, reqRecvOffset; +}; + +struct localRequestData { + uintptr_t reqBuff; + size_t reqSize; + intptr_t reqOffset; +}; ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // This transport cannot be used for p2p @@ -66,24 +79,23 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* return ncclSuccess; } -ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) { - size_t size = resources->size; +ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { + size_t size = prop->size; // Create a Multicast group - 
CUmulticastObjectProp* prop = &resources->properties; INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank); - CUCHECK(cuMulticastCreate(&resources->mcHandle, prop)); + CUCHECK(cuMulticastCreate(mcHandle, prop)); - if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) { + if ((NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) && (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)) { // Get a handle to pass to other ranks - CUCHECK(cuMemExportToShareableHandle(shareableHandle, resources->mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0)); + CUCHECK(cuMemExportToShareableHandle(shareableHandle, *mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0)); } else { - memcpy(shareableHandle, &resources->mcHandle, sizeof(resources->mcHandle)); + memcpy(shareableHandle, mcHandle, sizeof(CUmemGenericAllocationHandle)); } - INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", resources->mcHandle, nranks, size, rank); + INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", *mcHandle, nranks, size, rank); return ncclSuccess; } @@ -94,7 +106,7 @@ ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* return ncclSuccess; } -ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, char* shareableHandle) { +ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) { CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE; INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank); @@ -102,36 +114,27 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* r // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // cuMem UDS support - int fd = *(int *)shareableHandle; - TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from 
rank %d fd %d", comm->localRank, rank, fd); + int fd = -1; + TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank); struct ncclProxyConnector proxyConn; int tpProxyRank = comm->topParentRanks[rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn)); - TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank); - NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, (int *)shareableHandle)); - fd = *(int *)shareableHandle; + TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank); + NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, shareableHandle, &fd)); TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank); - CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type)); + CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type)); + (void) close(fd); } else { if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) { - CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)shareableHandle, type)); + CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type)); } else { - memcpy(&resources->mcHandle, shareableHandle, sizeof(resources->mcHandle)); + memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle)); } } return ncclSuccess; } ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE; - - // Import and map the remote memory descriptor to the local GPU - if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - // cuMem UDS support - int fd = *(int *)resources->shareableHandle; - (void) close(fd); - } - return ncclSuccess; } @@ -147,6 +150,7 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct 
ncclNvlsSharedRes* r prop.location.id = resources->dev; prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE; CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + resources->ucGran = granularity; // Map a VA for UC memory CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0)); @@ -181,6 +185,14 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* re return ncclSuccess; } +ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { + CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); + CUCHECK(cuMemUnmap(ptr, size)); + CUCHECK(cuMemAddressFree(ptr, size)); + CUCHECK(cuMemRelease(*mcHandler)); + return ncclSuccess; +} + ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { size_t size = resources->size; CUdeviceptr ptr = 0; @@ -263,6 +275,9 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { int nHeads = comm->channels[0].nvls.nHeads; int headRank = comm->channels[0].nvls.headRank; + char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; + uintptr_t *nvlsShmem = NULL; + size_t typeSize; CUdevice dev; CUCHECK(cuCtxGetDevice(&dev)); @@ -313,11 +328,11 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { char* shareableHandle = resources->shareableHandle; NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup); if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, comm->localRank, comm->localRanks, shareableHandle), res, cleanup); + NCCLCHECKGOTO(nvlsGroupCreate(comm, &resources->properties, comm->localRank, comm->localRanks, &resources->mcHandle, shareableHandle), res, cleanup); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); } else { 
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); - NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, comm->localRankToRank[0], shareableHandle), res, cleanup); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &resources->mcHandle), res, cleanup); } NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup); @@ -374,6 +389,23 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { } } + /* create shared memory for fast NVLS buffer registration */ + typeSize = sizeof(struct localRegData); + if (comm->localRank == 0) { + shmPath[0] = '\0'; + NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsShmemHandle), res, cleanup); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup); + NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsShmemHandle), res, cleanup); + } + /* need 2 pools and a shared counter for shmem-based collectives */ + comm->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; + comm->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsShmem.cnt[0] + sizeof(size_t)); + comm->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsShmem.ptr[0] + typeSize * comm->localRanks); + comm->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsShmem.cnt[1] + sizeof(size_t)); + comm->nvlsShmem.round = 0; + return res; cleanup: @@ -394,6 +426,371 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { return ncclSuccess; } +ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct 
localRequestData *reqData, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) { + ncclResult_t ret = ncclSuccess; + struct ncclRegRecord *regRecord = NULL; + struct localRequestData *myReqData = &reqData[comm->localRank]; + CUdeviceptr regPtr = 0; + CUmulticastObjectProp prop; + char shareableHandle[NVLS_HANDLE_SIZE]; + CUmemGenericAllocationHandle mcHandle; + size_t granularity; + size_t minSize; + bool localRegBufUsed = false; + + /* get minimal size of nvls buffers */ + minSize = reqData[0].reqSize; + for (int i = 1; i < comm->localRanks; ++i) { + if (minSize > reqData[i].reqSize) + minSize = reqData[i].reqSize; + } + + /* start registration */ + memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp)); + prop.size = minSize; + CUCHECKGOTO(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)myReqData->reqBuff, minSize, 0), ret, fail); + + // Create a VA for the NVLS + CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, granularity, 0U, 0), ret, fail); + // Map the VA locally + CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, 
fail); + + NCCLCHECKGOTO(ncclCalloc(®Record, 1), ret, fail); + regRecord->buff = myReqData->reqBuff; + regRecord->size = myReqData->reqSize; + regRecord->regAddr = regPtr; + regRecord->regSize = minSize; + regRecord->dev = comm->nvlsResources->dev; + regRecord->mcHandle = mcHandle; + /* get all buffer addresses */ + NCCLCHECKGOTO(ncclCalloc(®Record->addrs, comm->localRanks), ret, fail); + regRecord->addrs[comm->localRank] = regRecord->buff; + NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsShmem, regRecord->addrs + comm->localRank, regRecord->addrs, sizeof(uintptr_t)), ret, fail); + /* enqueue record */ + ncclIntruQueueEnqueue(&comm->regRecordQueue, regRecord); + + localRegBufUsed = true; + +exit: + if (localRegBufUsed) + *regAddr = (uintptr_t)regPtr + userBuff - myReqData->reqBuff; + *regUsed = localRegBufUsed; + return ret; +fail: + localRegBufUsed = false; + goto exit; +} + +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { + ncclResult_t ret = ncclSuccess; + bool localRegBufUsed = false; + struct localRegData *regData = NULL; + struct localRequestData *reqData = NULL; + struct ncclRegRecord *regRecordHead = NULL, *sendRegRecord = NULL, *recvRegRecord = NULL; + struct ncclRegRequest *regRequestHead = NULL, *sendRegRequest = NULL, *recvRegRequest = NULL; + bool sendNeedReg = false, recvNeedReg = false; + CUdeviceptr regSendPtr = 0; + CUdeviceptr regRecvPtr = 0; + + *outRegBufUsed = false; + + NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks), ret, fail); + + /* first check whether the buffer has been registered and matches each other globally */ + regRecordHead = ncclIntruQueueHead(&comm->regRecordQueue); + while (regRecordHead && ((sendRegRecord == NULL && sendbuff != NULL) || (recvRegRecord == NULL && recvbuff != NULL))) { + /* check send reg record */ + if (sendRegRecord == NULL && 
regRecordHead->buff <= (uintptr_t)sendbuff && + regRecordHead->buff + regRecordHead->size >= (uintptr_t)sendbuff + sendbuffSize) { + regData[comm->localRank].recSendbuff = regRecordHead->buff; + regData[comm->localRank].recSendOffset = (uintptr_t)sendbuff - regRecordHead->buff; + sendRegRecord = regRecordHead; + } + + /* check recv reg record */ + if (recvRegRecord == NULL && regRecordHead->buff <= (uintptr_t)recvbuff && + regRecordHead->buff + regRecordHead->size >= (uintptr_t)recvbuff + recvbuffSize) { + regData[comm->localRank].recRecvbuff = regRecordHead->buff; + regData[comm->localRank].recRecvOffset = (uintptr_t)recvbuff - regRecordHead->buff; + recvRegRecord = regRecordHead; + } + regRecordHead = regRecordHead->next; + } + + /* prepare registration request for later reference */ + regRequestHead = ncclIntruQueueHead(&comm->regRequestQueue); + while (regRequestHead && ((sendRegRequest == NULL && sendbuff != NULL) || (recvRegRequest == NULL && recvbuff != NULL))) { + /* check send reg request */ + if (regRequestHead->buff <= (uintptr_t)sendbuff && + regRequestHead->buff + regRequestHead->size >= (uintptr_t)sendbuff + sendbuffSize) { + regData[comm->localRank].reqSendbuff = regRequestHead->buff; + regData[comm->localRank].reqSendSize = regRequestHead->size; + regData[comm->localRank].reqSendOffset = (uintptr_t)sendbuff - regRequestHead->buff; + sendRegRequest = regRequestHead; + } + + /* check recv reg request */ + if (regRequestHead->buff <= (uintptr_t)recvbuff && + regRequestHead->buff + regRequestHead->size >= (uintptr_t)recvbuff + recvbuffSize) { + regData[comm->localRank].reqRecvbuff = regRequestHead->buff; + regData[comm->localRank].reqRecvSize = regRequestHead->size; + regData[comm->localRank].reqRecvOffset = (uintptr_t)recvbuff - regRequestHead->buff; + recvRegRequest = regRequestHead; + } + regRequestHead = regRequestHead->next; + } + + NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsShmem, regData + comm->localRank, regData, sizeof(struct 
localRegData)), ret, fail); + + /* first check whether all local ranks find their registered buffer */ + for (int i = 0; i < comm->localRanks; ++i) { + if (regData[i].recSendbuff == 0 || sendRegRecord->addrs[i] != regData[i].recSendbuff) { + sendNeedReg = true; + } + + if (regData[i].recRecvbuff == 0 || recvRegRecord->addrs[i] != regData[i].recRecvbuff) { + recvNeedReg = true; + } + } + + if (sendNeedReg == false) { + for (int i = 0; i < comm->localRanks - 1; ++i) { + if (regData[i].recSendOffset != regData[i + 1].recSendOffset) { + /* offset are different, we cannot apply user buffer registration */ + goto fail; + } + } + + /* reuse previous registered buffer if possible */ + if (!sendNeedReg) + regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank].recSendOffset); + } + + if (recvNeedReg == false) { + for (int i = 0; i < comm->localRanks - 1; ++i) { + if (regData[i].recRecvOffset != regData[i + 1].recRecvOffset) { + goto fail; + } + } + + if (!recvNeedReg) + regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank].recRecvOffset); + } + + if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) { + localRegBufUsed = true; + INFO(NCCL_NVLS, "rank %d reuse local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); + goto exit; + } + + /* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate + * in register request cache. 
*/ + NCCLCHECKGOTO(ncclCalloc(&reqData, comm->localRanks), ret, fail); + if (sendNeedReg && sendbuff != NULL) { + /* copy request data got from previous shmem AG */ + intptr_t offset = regData[0].reqSendOffset; + for (int i = 0; i < comm->localRanks; ++i) { + if (regData[i].reqSendbuff == 0 || offset != regData[i].reqSendOffset) goto fail; + reqData[i].reqBuff = regData[i].reqSendbuff; + reqData[i].reqSize = regData[i].reqSendSize; + reqData[i].reqOffset = regData[i].reqSendOffset; + } + tryRegisterBuffer(comm, reqData, (uintptr_t)sendbuff, sendbuffSize, ®SendPtr, &localRegBufUsed); + if (localRegBufUsed == false) goto fail; + } + + if (recvNeedReg && recvbuff != NULL) { + intptr_t offset = regData[0].reqRecvOffset; + for (int i = 0; i < comm->localRanks; ++i) { + if (regData[i].reqRecvbuff == 0 || offset != regData[i].reqRecvOffset) goto fail; + reqData[i].reqBuff = regData[i].reqRecvbuff; + reqData[i].reqSize = regData[i].reqRecvSize; + reqData[i].reqOffset = regData[i].reqRecvOffset; + } + tryRegisterBuffer(comm, reqData, (uintptr_t)recvbuff, recvbuffSize, ®RecvPtr, &localRegBufUsed); + if (localRegBufUsed == false) goto fail; + } + + INFO(NCCL_NVLS, "rank %d successfully local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); + +exit: + *outRegBufSend = (void*)regSendPtr; + *outRegBufRecv = (void*)regRecvPtr; + *outRegBufUsed = localRegBufUsed; + free(regData); + free(reqData); + return ncclSuccess; +fail: + localRegBufUsed = false; + goto exit; +} + +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { + ncclResult_t ret = ncclSuccess; + bool localRegBufUsed = false; + struct ncclNvlsMcHandleList* sendRecord = NULL; + 
struct ncclNvlsMcHandleList* recvRecord = NULL; + CUdeviceptr regSendPtr = 0; + CUdeviceptr regRecvPtr = 0; + CUmulticastObjectProp prop; + char shareableHandle[NVLS_HANDLE_SIZE]; + CUmemGenericAllocationHandle sendMcHandle, recvMcHandle; + size_t sendGran, recvGran; + bool *regBufFlags = NULL; + struct graphRegData *rdata = NULL; + const void *baseSend = NULL; + const void *baseRecv = NULL; + size_t baseSendSize = 1; + size_t baseRecvSize = 1; + + *outRegBufUsed = false; + NCCLCHECKGOTO(ncclCalloc(®BufFlags, comm->localRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&rdata, comm->localRanks), ret, fail); + + if (sendbuffSize > 0 || recvbuffSize > 0) { + /* retrieve base pointer and size */ + if (CUPFN(cuMemGetAddressRange) == nullptr) goto fail; + if (sendbuff != NULL) + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff), ret, fail); + if (recvbuff != NULL) + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail); + + memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp)); + prop.size = baseSendSize; + CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + prop.size = baseRecvSize; + CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + + localRegBufUsed = ((uint64_t)baseSend % sendGran != 0 || (uint64_t)baseRecv % recvGran != 0) ? 
false : true; + regBufFlags[comm->localRank] = localRegBufUsed; + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail); + for (int i = 0; i < comm->localRanks; ++i) + if (regBufFlags[i] == false) goto fail; + + if (sendbuff != NULL) { + /* check send buffer offset and size */ + rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend; + rdata[comm->localRank].size = baseSendSize; + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); + baseSendSize = rdata[0].size; + for (int i = 1; i < comm->localRanks; ++i) { + if (rdata[0].offset != rdata[i].offset) goto fail; + if (baseSendSize > rdata[i].size) baseSendSize = rdata[i].size; + } + if (baseSendSize % sendGran != 0) goto fail; + + prop.size = baseSendSize; + + /* register sendbuff */ + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &sendMcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(sendMcHandle, comm->nvlsResources->dev), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(sendMcHandle, 0, (CUdeviceptr)baseSend, baseSendSize, 0), ret, fail); + + // Create a VA for the NVLS + CUCHECKGOTO(cuMemAddressReserve(®SendPtr, baseSendSize, sendGran, 0U, 0), ret, fail); + // Map the VA locally + CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, 
sendMcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail); + + sendRecord = ncclMemoryPoolAlloc(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent); + sendRecord->mcHandle = sendMcHandle; + sendRecord->ptr = regSendPtr; + sendRecord->dev = comm->nvlsResources->dev; + sendRecord->size = baseSendSize; + } + + if (recvbuff != NULL) { + rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv; + rdata[comm->localRank].size = baseRecvSize; + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); + baseRecvSize = rdata[0].size; + for (int i = 1; i < comm->localRanks; ++i) { + if (rdata[0].offset != rdata[i].offset) goto fail; + if (baseRecvSize > rdata[i].size) baseRecvSize = rdata[i].size; + } + if (baseRecvSize % recvGran != 0) goto fail; + + prop.size = baseRecvSize; + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &recvMcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(recvMcHandle, comm->nvlsResources->dev), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(recvMcHandle, 0, (CUdeviceptr)baseRecv, baseRecvSize, 0), ret, fail); + + // Create a VA for the NVLS + CUCHECKGOTO(cuMemAddressReserve(®RecvPtr, baseRecvSize, recvGran, 0U, 0), ret, fail); + // Map the VA locally + CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, 
recvMcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail); + + recvRecord = ncclMemoryPoolAlloc(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent); + recvRecord->mcHandle = recvMcHandle; + recvRecord->ptr = regRecvPtr; + recvRecord->dev = comm->nvlsResources->dev; + recvRecord->size = baseRecvSize; + } + + localRegBufUsed = true; + } + +exit: + if (localRegBufUsed == false) { + if (sendRecord) { + ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size); + ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, sendRecord); + } + + if (recvRecord) { + ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size); + ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, recvRecord); + } + } else { + if (sendRecord) { + *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend); + ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, sendRecord); + } + + if (recvRecord) { + *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv); + ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, recvRecord); + } + + INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr); + } + + *outRegBufUsed = localRegBufUsed; + free(regBufFlags); + free(rdata); + /* always return success. 
*/ + return ncclSuccess; +fail: + localRegBufUsed = false; + goto exit; +} + #else /* @@ -413,4 +810,18 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { return ncclSuccess; } +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { + *outRegBufUsed = false; + return ncclSuccess; +} + +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { + *outRegBufUsed = false; + return ncclSuccess; +} + +ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { + return ncclSuccess; +} + #endif /* CUDA_VERSION >= 12010 */ diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 3630233307..3e4dab7e44 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -193,9 +193,13 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v // cuMem API support CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; CUmemGenericAllocationHandle handle; - NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); - CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0)); + if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + // Return the native cuMem handle for later Export/Import via UDS + memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle)); + } else { + CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0)); + } #else return ncclInternalError; #endif @@ -215,17 +219,6 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v } ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { - if (ncclCuMemEnable()) { - // cuMem API support - CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; - - if 
(type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - int fd = *(int *) &ipcDesc->cuDesc.data; - if (fd <= 0) return ncclInternalError; - (void) close(fd); - } - } - return ncclSuccess; } @@ -242,20 +235,20 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // UDS fd support struct ncclProxyConnector proxyConn; - int fd = *(int *)(&cuDesc->data); - int newFd = -1; + int fd = -1; + // Send cuMem handle to remote for conversion to an fd NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn)); - NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, &newFd)); - INFO(NCCL_P2P, "UDS converted fd %d -> %d on peer %d", fd, newFd, tpPeer); - CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)newFd, type)); - close(newFd); + NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, &cuDesc->data, &fd)); + INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer); + CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type)); + (void) close(fd); } else { CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type)); } CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0)); CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0)); - TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%lx dptr %p", size, (long)handle, (void*)dptr); + TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%llx dptr %p", size, handle, (void*)dptr); // Allow access by the local GPU CUmemAccessDesc accessDesc = {}; @@ -263,7 +256,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz accessDesc.location.id = comm->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1)); - TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, 
accessDesc.location.id); + TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id); *devMemPtr = (void *)dptr; #else @@ -294,8 +287,8 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* return ncclSuccess; } -static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { - if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) { +static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { + if (myInfo->pidHash == peerInfo->pidHash) { if (peerInfo->cudaDev != myInfo->cudaDev) { // Same PID different GPUs, enable P2P access // Legacy CUDA IPC @@ -307,6 +300,18 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, s peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err)); return ncclInternalError; } +#if CUDART_VERSION >= 11030 + // cuMem API support + if (ncclCuMemEnable()) { + // Allow direct access to the remote buffer from the local GPU + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = myInfo->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev); + CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1)); + } +#endif } *devMem = p2pBuff->directPtr; *ipcPtr = NULL; @@ -349,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st if (intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) { + if (myInfo->pidHash == 
peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s", @@ -384,7 +389,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); } else { NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc)); + NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc)); } return ncclSuccess; @@ -413,7 +418,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st if (intermediateRank == -1) { info->rank = myInfo->rank; - if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) { + if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; recv->conn.flags |= info->read ? 
NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { @@ -437,7 +442,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc)); + NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc)); return ncclSuccess; } @@ -447,7 +452,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co struct ncclRecvMem* remDevMem = NULL; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); + NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); char* buff = (char*)(remDevMem+1); for (int p=0; pconn.ptrExchange = &resources->sendDevMem->ptrExchange; send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange; } + // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time + send->proxyConn.proxyProgress = p2pTransport.send.proxyProgress; return ncclSuccess; } @@ -495,7 +502,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn recv->conn.tail = &resources->devShm->recvMem.tail; recv->conn.head = &resources->devShm->sendMem.head; } else { - NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); + NCCLCHECK(p2pMap(comm, &recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, 
(void**)&remDevMem, &resources->sendMemIpc)); struct ncclRecvMem* devMem = resources->recvDevMem; recv->conn.tail = &devMem->tail; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index aed8dd7d9e..5b24429199 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -164,6 +164,10 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co send->conn.tail = &proxyInfo.ceRecvMem->tail; send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo; } + + // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time + send->proxyConn.proxyProgress = shmTransport.send.proxyProgress; + return ncclSuccess; } @@ -193,6 +197,10 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo; recv->conn.tail = &proxyInfo.ceRecvMem->tail; } + + // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time + recv->proxyConn.proxyProgress = shmTransport.recv.proxyProgress; + return ncclSuccess; }