2.20.3-1
Add support for alternating rings, allow for cross-nic rings without cross-rail communication. Add support for user buffer registration for network send/recv. Optimize aggregated operations to better utilize all channels. Add flattening for BCM PCI gen5 switches. Add support for inter-node NVLink communication. Add support for port fusion in NET/IB. Add support for ReduceScatter and AllGather using Collnet. Update net API to v8. Fix hang during A2A connection.
Šī revīzija ir iekļauta:
@@ -17,13 +17,14 @@
|
||||
#define NCCL_PTR_DMABUF 0x4
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 8
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#include "net_v8.h"
|
||||
#include "net_v7.h"
|
||||
#include "net_v6.h"
|
||||
#include "net_v5.h"
|
||||
|
||||
@@ -26,6 +26,7 @@ typedef struct {
|
||||
int needsProxyProgress;
|
||||
} ncclNetDeviceHandle_v7_t;
|
||||
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
#ifndef NCCL_NET_V6_H_
|
||||
#define NCCL_NET_V6_H_
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V6 8
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
|
||||
@@ -22,8 +22,6 @@ typedef struct {
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v7_t;
|
||||
|
||||
typedef ncclNetProperties_v7_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V8_H_
|
||||
#define NCCL_NET_V8_H_
|
||||
|
||||
#include "net_device.h"
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v8_t;
|
||||
|
||||
typedef ncclNetProperties_v8_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If sizes is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v8_t;
|
||||
|
||||
#endif // end include guard
|
||||
+100
-26
@@ -15,15 +15,37 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;
|
||||
|
||||
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
//pluginPciPath(dev, &props.pciPath);
|
||||
//pluginPtrSupport(dev, &props.ptrSupport);
|
||||
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
// Below are default values, if unsure don't change.
|
||||
|
||||
props->name = "Example";
|
||||
// Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
|
||||
props->pciPath = NULL;
|
||||
// Only used to detect NICs with multiple PCI attachments.
|
||||
props->guid = 0;
|
||||
// Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
// If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
|
||||
props->regIsGlobal = 0;
|
||||
// Speed in *Mbps*. 100000 means 100G
|
||||
props->speed = 100000;
|
||||
// Port number, used in conjunction with guid
|
||||
props->port = 0;
|
||||
// Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
|
||||
props->latency = 0;
|
||||
// Maximum number of comm objects we can create.
|
||||
props->maxComms = 1024*1024;
|
||||
// Maximum number of receive operations taken by irecv().
|
||||
props->maxRecvs = 1;
|
||||
// Coupling with NCCL network device-side code.
|
||||
props->netDeviceType = 0;
|
||||
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
|
||||
return ncclInternalError;
|
||||
}
|
||||
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
|
||||
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; }
|
||||
@@ -38,7 +60,7 @@ __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_m
|
||||
|
||||
#define PLUGIN_NAME "Plugin"
|
||||
|
||||
const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
const ncclNet_v8_t ncclNetPlugin_v8 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
@@ -60,10 +82,62 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
};
|
||||
|
||||
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
|
||||
//pluginPciPath(dev, &props.pciPath);
|
||||
//pluginPtrSupport(dev, &props.ptrSupport);
|
||||
return ncclInternalError;
|
||||
__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
props_v7->name = props.name;
|
||||
props_v7->pciPath = props.pciPath;
|
||||
props_v7->guid = props.guid;
|
||||
props_v7->ptrSupport = props.ptrSupport;
|
||||
props_v7->speed = props.speed;
|
||||
props_v7->port = props.port;
|
||||
props_v7->maxComms = props.maxComms;
|
||||
props_v7->maxRecvs = props.maxRecvs;
|
||||
props_v7->netDeviceType = props.netDeviceType;
|
||||
props_v7->netDeviceVersion = props.netDeviceVersion;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
|
||||
return pluginRegMr(collComm, data, size, type, mhandle);
|
||||
}
|
||||
|
||||
const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v7,
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
.irecv = pluginIrecv,
|
||||
.iflush = pluginIflush,
|
||||
.test = pluginTest,
|
||||
.closeSend = pluginCloseSend,
|
||||
.closeRecv = pluginCloseRecv,
|
||||
.closeListen = pluginCloseListen,
|
||||
.getDeviceMr = pluginGetDeviceMr,
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
};
|
||||
|
||||
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
props_v6->name = props.name;
|
||||
props_v6->pciPath = props.pciPath;
|
||||
props_v6->guid = props.guid;
|
||||
props_v6->ptrSupport = props.ptrSupport;
|
||||
props_v6->speed = props.speed;
|
||||
props_v6->port = props.port;
|
||||
props_v6->maxComms = props.maxComms;
|
||||
props_v6->maxRecvs = props.maxRecvs;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
|
||||
@@ -77,7 +151,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
@@ -98,7 +172,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
.irecv = pluginIrecv,
|
||||
@@ -110,17 +184,17 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
};
|
||||
|
||||
/* v4 Compat */
|
||||
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
|
||||
ncclNetProperties_v6_t props_v6;
|
||||
ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
|
||||
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
props->name = props_v6.name;
|
||||
props->pciPath = props_v6.pciPath;
|
||||
props->guid = props_v6.guid;
|
||||
props->ptrSupport = props_v6.ptrSupport;
|
||||
props->speed = props_v6.speed;
|
||||
props->port = props_v6.port;
|
||||
props->maxComms = props_v6.maxComms;
|
||||
props_v4->name = props.name;
|
||||
props_v4->pciPath = props.pciPath;
|
||||
props_v4->guid = props.guid;
|
||||
props_v4->ptrSupport = props.ptrSupport;
|
||||
props_v4->speed = props.speed;
|
||||
props_v4->port = props.port;
|
||||
props_v4->maxComms = props.maxComms;
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
|
||||
@@ -157,7 +231,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
@@ -202,7 +276,7 @@ const ncclNet_v3_t ncclNetPlugin_v3 = {
|
||||
.listen = pluginListen_v3,
|
||||
.connect = pluginConnect_v3,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
@@ -223,7 +297,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 19
|
||||
NCCL_PATCH := 4
|
||||
NCCL_MINOR := 20
|
||||
NCCL_PATCH := 3
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+1
-1
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
|
||||
INCEXPORTS := nccl.h nccl_net.h
|
||||
LIBSRCFILES := \
|
||||
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
|
||||
init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
|
||||
init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \
|
||||
$(wildcard graph/*.cc) \
|
||||
$(wildcard misc/*.cc) \
|
||||
$(wildcard transport/*.cc)
|
||||
|
||||
+14
-6
@@ -221,6 +221,7 @@ struct bootstrapState {
|
||||
struct ncclSocket ringSendSocket;
|
||||
union ncclSocketAddress* peerCommAddresses;
|
||||
union ncclSocketAddress* peerProxyAddresses;
|
||||
uint64_t* peerProxyAddressesUDS;
|
||||
struct unexConn* unexpectedConnections;
|
||||
int cudaDev;
|
||||
int rank;
|
||||
@@ -295,6 +296,7 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
|
||||
// Create the service proxy
|
||||
NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
|
||||
NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks));
|
||||
|
||||
// proxy is aborted through a message; don't set abortFlag
|
||||
NCCLCHECK(ncclCalloc(&proxySocket, 1));
|
||||
@@ -302,7 +304,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
NCCLCHECK(ncclSocketListen(proxySocket));
|
||||
NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
|
||||
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
|
||||
// cuMem UDS support
|
||||
state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)));
|
||||
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
||||
|
||||
@@ -355,8 +360,6 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
for (int i = 0; i < nranks; ++i) {
|
||||
comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
|
||||
}
|
||||
comm->proxyState = parent->sharedRes->proxyState;
|
||||
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
|
||||
} else {
|
||||
// Create the service proxy
|
||||
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
|
||||
@@ -366,10 +369,14 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
|
||||
memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
|
||||
// cuMem UDS support
|
||||
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
|
||||
state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
|
||||
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
|
||||
INFO(NCCL_INIT, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks, color, key, prev, next);
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
@@ -568,7 +575,7 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
if (state->unexpectedConnections != NULL) {
|
||||
unexpectedFree(state);
|
||||
if (*state->abortFlag == 0) {
|
||||
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
|
||||
WARN("Unexpected connections are not empty");
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -592,6 +599,7 @@ ncclResult_t bootstrapAbort(void* commState) {
|
||||
NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
|
||||
free(state->peerCommAddresses);
|
||||
free(state->peerProxyAddresses);
|
||||
free(state->peerProxyAddressesUDS);
|
||||
free(state);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -191,6 +191,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
va_start(vargs, fmt);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
|
||||
// Rewind len so that we can replace the final \0 by \n
|
||||
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
|
||||
+181
-48
@@ -12,63 +12,50 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const int *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*int(chunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t count = args->count;
|
||||
size_t offset;
|
||||
size_t dataOffset;
|
||||
int nelem;
|
||||
int rankDest;
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
T *outputBuf = (T*)args->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset,nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
|
||||
realChunkSize = int(realChunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + int(bid*realChunkSize);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
dataOffset = gridOffset + elemOffset;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ringRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
if (inputBuf + chunkOffset == outputBuf + offset) { // In place
|
||||
prims.directSend(chunkOffset, offset, nelem);
|
||||
if (inputBuf + dataOffset == outputBuf + offset) { // In place
|
||||
prims.directSend(dataOffset, offset, nelem);
|
||||
} else {
|
||||
prims.directCopySend(chunkOffset, offset, nelem);
|
||||
prims.directCopySend(dataOffset, offset, nelem);
|
||||
}
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ringRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ringRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
// Final wait/copy.
|
||||
prims.directRecv(offset, nelem);
|
||||
@@ -102,13 +89,14 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t count = args->count;
|
||||
const ssize_t rank = ncclShmem.comm.rank;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
|
||||
const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
|
||||
@@ -122,10 +110,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.gather(offset, nvls->nHeads * size, nelem, size, -1, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndBcast) {
|
||||
// Bcast through NVLS
|
||||
@@ -133,9 +121,9 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -150,7 +138,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
/* used as sync */
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.gather(0, 0, 0, 0, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndBcast) {
|
||||
@@ -161,13 +149,158 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
/* used as sync */
|
||||
prims.recv(0, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t inpOffset = gridOffset + bid * chunkSize;
|
||||
ssize_t outOffset = inpOffset + rank * size;
|
||||
int nelem = min(chunkSize, size - inpOffset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
ssize_t inpOffset = gridOffset + elemOffset;
|
||||
ssize_t outOffset = inpOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directSend(inpOffset, outOffset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
template<bool BcastSendNotRecv>
|
||||
struct Scatterer {
|
||||
struct ncclWorkElem* args;
|
||||
ssize_t chunkSize;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
|
||||
) {
|
||||
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = direct->nHeads;
|
||||
int bid = args->bid;
|
||||
char* inbuf = (char*)args->sendbuff;
|
||||
char* outbuf = (char*)args->recvbuff;
|
||||
ssize_t sizePerRank = args->count*sizeof(T);
|
||||
bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);
|
||||
|
||||
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
|
||||
int src = 0;
|
||||
int rail;
|
||||
if (BcastSendNotRecv) {
|
||||
rail = direct->headRank;
|
||||
} else {
|
||||
rail = direct->headRank+1;
|
||||
if (rail == nRails) rail = 0;
|
||||
}
|
||||
do {
|
||||
int node = railAllBeg/sizePerRank;
|
||||
int railAllOffset = 0;
|
||||
while (railAllOffset < railAllSize) {
|
||||
ssize_t railOneBeg = node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
|
||||
int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
|
||||
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
|
||||
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
|
||||
int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
|
||||
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
/*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
|
||||
/*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
|
||||
/*PreOpSrcs=*/0>
|
||||
(tid, tn, 0, nullptr, false,
|
||||
/*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
|
||||
return (char*)srcPtrs[src] + railAllOffset;
|
||||
},
|
||||
/*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
|
||||
return d < outIsDst ? outbuf + userOneBeg
|
||||
: (char*)dstPtrs[d-outIsDst] + railAllOffset;
|
||||
},
|
||||
delta);
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
src += 1;
|
||||
rail += 1;
|
||||
if (rail == nRails) rail = 0;
|
||||
} while (!BcastSendNotRecv && src < nRails-1);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Entry point for this work element: splits the block's warps into up to
 * three groups, each driving its own Primitives instance.
 *
 *   Phase 1: send to network            (peer: direct->out)
 *   Phase 2: recv network -> deposit output + send to bcast
 *            (recv from direct->out, send to direct->heads+1)
 *   Phase 3: recv bcast -> deposit output (recv from direct->heads+1)
 *
 * Every phase walks the same grid: the range [0, nNodes*sizePerRank) is
 * visited in steps of nChannels*chunkSize, and within a step this channel
 * (args->bid) owns one chunk of chunkSize.
 *
 * Warp budget: the shares are 1 : 2 : 2 when direct->nHeads > 1 and
 * 1 : 1 : 0 otherwise. Phases 2 and 3 get the truncated scaled share and
 * phase 1 takes whatever remains of args->nWarps.
 *
 * @param args  work element; reads nChannels, chunkCount, count, nWarps,
 *              bid and sendbuff here (the Scatterer receives the whole
 *              element).
 */
__device__ __forceinline__ void run(ncclWorkElem *args) {
  using Proto = ProtoSimple<1, 1>;

  const int tid = threadIdx.x;
  struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
  const int nChannels = args->nChannels;
  const ssize_t chunkSize = int(args->chunkCount);
  const ssize_t sizePerRank = args->count;
  // Bounds shared by the three grid loops below.
  const ssize_t totalSize = ncclShmem.comm.nNodes*sizePerRank;
  const ssize_t gridStride = nChannels*chunkSize;

  // Distribute the warps between the three phases.
  const bool multiRail = (direct->nHeads > 1);
  const int relayShare = multiRail ? 2 : 1;
  const int bcastShare = multiRail ? 2 : 0;
  const float warpsPerShare = float(args->nWarps)/float(1 + relayShare + bcastShare);
  const int bcastWarps = int(warpsPerShare*bcastShare);
  const int relayWarps = int(warpsPerShare*relayShare);
  const int sendWarps = args->nWarps - (relayWarps + bcastWarps);

  // Consecutive thread ranges: [0, sendThreads) [sendThreads, relayEnd) [relayEnd, bcastEnd)
  const int sendThreads = sendWarps*WARP_SIZE;
  const int relayThreads = relayWarps*WARP_SIZE;
  const int bcastThreads = bcastWarps*WARP_SIZE;
  const int relayEnd = sendThreads + relayThreads;
  const int bcastEnd = relayEnd + bcastThreads;

  if (tid < sendThreads) {
    // Phase 1: send to network
    Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
      prims(tid, sendThreads, nullptr, &direct->out, args->sendbuff, nullptr,
            /*redOpArg=*/0, 0*Proto::MaxGroupWidth, 1, 1);
    // Window of the grid that belongs to this node.
    const ssize_t nodeBeg = ncclShmem.comm.node*sizePerRank;
    const ssize_t nodeEnd = nodeBeg + sizePerRank;
    for (ssize_t gridBeg = 0; gridBeg < totalSize; gridBeg += gridStride) {
      const ssize_t chunkBeg = gridBeg + args->bid*chunkSize;
      const ssize_t chunkEnd = min(chunkBeg + chunkSize, totalSize);
      // Send only the overlap between this channel's chunk and the node's
      // window; a non-overlapping chunk still issues a zero-length send.
      const ssize_t lo = max(chunkBeg, nodeBeg);
      const ssize_t hi = min(chunkEnd, nodeEnd);
      prims.send(lo - nodeBeg, max(ssize_t(0), hi - lo));
    }
  } else if (tid < relayEnd) {
    // Phase 2: Recv network -> deposit output + send to bcast
    Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
      prims(tid - sendThreads, relayThreads, &direct->out, direct->heads+1, nullptr, nullptr,
            /*redOpArg=*/0, 1*Proto::MaxGroupWidth, 0, 0);
    for (ssize_t gridBeg = 0; gridBeg < totalSize; gridBeg += gridStride) {
      Scatterer</*BcastSendNotRecv=*/true> scat;
      scat.args = args;
      scat.chunkSize = chunkSize;
      scat.railGridOffset = gridBeg;
      prims.process</*Recv=*/1, /*Send=*/1>(scat);
    }
  } else if (tid < bcastEnd) {
    // Phase 3: Recv bcast -> deposit output
    Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
      prims(tid - relayEnd, bcastThreads, direct->heads+1, nullptr, nullptr, nullptr,
            /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
    for (ssize_t gridBeg = 0; gridBeg < totalSize; gridBeg += gridStride) {
      Scatterer</*BcastSendNotRecv=*/false> scat;
      scat.args = args;
      scat.chunkSize = chunkSize;
      scat.railGridOffset = gridBeg;
      prims.process</*Recv=*/1, /*Send=*/0>(scat);
    }
  }
}
|
||||
};
|
||||
|
||||
+216
-188
@@ -12,84 +12,69 @@ namespace {
|
||||
/**
 * Ring all-reduce over the element range assigned to this channel.
 *
 * The channel processes channelCount (args->workCount) elements starting at
 * gridOffset (args->workOffset). Each pass of the outer loop covers
 * loopCount = nranks * chunkCount elements, one chunk per rank, and runs the
 * usual 2*(nranks-1) ring steps:
 *   - step 0            : send own chunk to the next GPU
 *   - nranks-2 steps    : recvReduceSend
 *   - step nranks-1     : directRecvReduceCopySend (final reduced value,
 *                         post-op applied)
 *   - nranks-2 steps    : directRecvCopySend
 *   - last step         : directRecv
 *
 * When fewer than loopCount elements remain, chunkCount is replaced by
 * args->lastChunkCount for that (final) pass.
 *
 * NOTE(review): nelem is min(chunkCount, remCount - chunkOffset) and is not
 * clamped at zero here; presumably Primitives tolerates a non-positive
 * element count for trailing chunks -- confirm against the primitives code.
 *
 * @tparam T      element type
 * @tparam RedOp  reduction operator
 * @tparam Proto  protocol used by Primitives
 * @param  args   work element; reads nWarps, chunkCount, lastChunkCount,
 *                workOffset, workCount, sendbuff, recvbuff and redOpArg.
 */
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
  const int tid = threadIdx.x;
  const int nthreads = (int)args->nWarps * WARP_SIZE;
  ncclRing *ring = &ncclShmem.channel.ring;
  int ringIx = ring->index;
  ssize_t chunkCount = args->chunkCount;
  const int nranks = ncclShmem.comm.nRanks;
  const ssize_t loopCount = nranks * chunkCount;
  ssize_t offset;
  ssize_t gridOffset = args->workOffset;
  ssize_t channelCount = args->workCount;
  int nelem;
  int chunk;

  Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
    (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);

  for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
    ssize_t remCount = channelCount - elemOffset;
    ssize_t chunkOffset;

    // Final, partial pass: switch to the smaller per-rank chunk.
    if (remCount < loopCount) chunkCount = args->lastChunkCount;

    // Wrap a ring index that is known to lie in [0, 2*nranks).
    auto modRanks = [&]__device__(int r)->int {
      return r - (r >= nranks ? nranks : 0);
    };

    // step 0: push data to next GPU
    chunk = modRanks(ringIx + nranks - 1);
    chunkOffset = chunk * chunkCount;
    offset = gridOffset + elemOffset + chunkOffset;
    nelem = (int)min(chunkCount, remCount - chunkOffset);
    prims.send(offset, nelem);

    // k-2 steps: reduce and copy to next GPU
    for (int j = 2; j < nranks; ++j) {
      chunk = modRanks(ringIx + nranks - j);
      chunkOffset = chunk * chunkCount;
      offset = gridOffset + elemOffset + chunkOffset;
      nelem = (int)min(chunkCount, remCount - chunkOffset);
      prims.recvReduceSend(offset, nelem);
    }

    // step k-1: reduce this buffer and data, which will produce the final
    // result that we store in this data and push to the next GPU
    chunk = ringIx + 0;
    chunkOffset = chunk * chunkCount;
    offset = gridOffset + elemOffset + chunkOffset;
    nelem = (int)min(chunkCount, remCount - chunkOffset);
    prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);

    // k-2 steps: copy to next GPU
    for (int j = 1; j < nranks - 1; ++j) {
      chunk = modRanks(ringIx + nranks - j);
      chunkOffset = chunk * chunkCount;
      offset = gridOffset + elemOffset + chunkOffset;
      nelem = (int)min(chunkCount, remCount - chunkOffset);
      prims.directRecvCopySend(offset, nelem);
    }

    // Make final copy from buffer to dest.
    chunk = modRanks(ringIx + 1);
    chunkOffset = chunk * chunkCount;
    offset = gridOffset + elemOffset + chunkOffset;
    nelem = (int)min(chunkCount, remCount - chunkOffset);
    prims.directRecv(offset, nelem);
  }
}
|
||||
@@ -97,43 +82,35 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclTree *tree = &ncclShmem.channel.tree;
|
||||
ssize_t chunkSize = int(
|
||||
Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
|
||||
/* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T));
|
||||
const ssize_t minChunkSize = int(
|
||||
Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads-2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T))
|
||||
/* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const ssize_t size = args->count;
|
||||
|
||||
if (loopSize > size)
|
||||
chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
|
||||
(tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
if (tree->up == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
else if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -143,23 +120,23 @@ namespace {
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0> prims
|
||||
(tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
if (tree->up == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directSendFromOutput(offset, nelem);
|
||||
}
|
||||
}
|
||||
else if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -169,19 +146,13 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclTree *tree = &ncclShmem.channel.tree;
|
||||
ssize_t chunkSize = int(
|
||||
Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
|
||||
: Proto::calcBytePerStep()/sizeof(T));
|
||||
const ssize_t minChunkSize = int(
|
||||
Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads - 2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T)) :
|
||||
Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T))
|
||||
/* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8);
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t channelCount = args->workCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
int nthreadsSplit;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
@@ -193,16 +164,13 @@ namespace {
|
||||
nthreadsSplit = (nthreads*7/(10*WARP_SIZE))*WARP_SIZE;
|
||||
}
|
||||
|
||||
if (loopSize > size)
|
||||
chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize);
|
||||
|
||||
if (tree->up == -1) {
|
||||
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_TREE_ARITY_TOP>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
|
||||
}
|
||||
}
|
||||
@@ -218,16 +186,16 @@ namespace {
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
|
||||
if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -238,16 +206,16 @@ namespace {
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 1*Proto::MaxGroupWidth);
|
||||
if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -282,7 +250,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t chunkSize = args->chunkCount;
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;
|
||||
|
||||
@@ -378,14 +346,10 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
ssize_t chunkSize = args->chunkCount;
|
||||
const bool hasOut = nvls->out != -1;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
|
||||
const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
|
||||
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
|
||||
@@ -401,62 +365,114 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
const int tidEndReduce = tidEndGather + nThreadsReduce;
|
||||
const int tidEndBcast = tidEndReduce + nThreadsBcast;
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
if (!hasOut) {
|
||||
if (args->oneNode) {
|
||||
const ssize_t loopCount = nvls->nHeads * chunkSize;
|
||||
const ssize_t channelCount = args->workCount;
|
||||
const ssize_t gridOffset = args->workOffset;
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce, broadcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
// Reduce, send to network
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkSize;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkSize, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (tid < tidEndBcast && nvls->headRank != -1) {
|
||||
// Recv from network, broadcast
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
} else {
|
||||
const int bid = args->bid;
|
||||
const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
if (!hasOut) {
|
||||
// Reduce, broadcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
// Reduce, send to network
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (tid < tidEndBcast && nvls->headRank != -1) {
|
||||
// Recv from network, broadcast
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -466,14 +482,13 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const int treeUp = nvls->treeUp;
|
||||
const int* treeDown = nvls->treeDown;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
|
||||
ssize_t chunkCount = args->chunkCount;
|
||||
const ssize_t loopCount = nvls->nHeads * chunkCount;
|
||||
const ssize_t channelCount = args->workCount;
|
||||
const ssize_t gridOffset = args->workOffset;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const bool hasUp = treeUp != -1;
|
||||
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
|
||||
@@ -481,6 +496,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
|
||||
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
|
||||
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
|
||||
const int nThreadsScatter = scatterWarps*WARP_SIZE;
|
||||
const int nThreadsGather = gatherWarps*WARP_SIZE;
|
||||
@@ -497,10 +514,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
@@ -508,10 +526,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
if (!hasUp) {
|
||||
@@ -520,9 +539,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
@@ -531,9 +553,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -543,9 +568,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -560,7 +588,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclTree *tree = &ncclShmem.channel.collnetChain;
|
||||
ssize_t chunkSize = int(args->lastChunkSize);
|
||||
ssize_t chunkSize = args->chunkCount;
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t size = args->count;
|
||||
|
||||
@@ -12,37 +12,25 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ring->userRanks[0];
|
||||
const int nextRank = ring->userRanks[1];
|
||||
const int root = args->root;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
T *outputBuf = (T*)args->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
|
||||
realChunkSize = int(realChunkSize);
|
||||
|
||||
ssize_t offset = gridOffset + int(bid*realChunkSize);
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
|
||||
if (rank == root) {
|
||||
if (inputBuf == outputBuf) {
|
||||
|
||||
@@ -25,6 +25,7 @@ struct ncclShmemGroup {
|
||||
union {
|
||||
unpackGroupShmem unpack;
|
||||
} devicePlugin;
|
||||
int32_t dstSizes[NCCL_MAX_NVLS_ARITY+1];
|
||||
};
|
||||
|
||||
struct ncclShmemData {
|
||||
|
||||
@@ -28,11 +28,11 @@ inline __device__ int loadInt(int* ptr) {
|
||||
template<typename RedFn, typename T, int Unroll, int BytePerPack,
|
||||
int MultimemSrcs, int MinSrcs, int MaxSrcs,
|
||||
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
|
||||
typename IntBytes>
|
||||
typename IntBytes, typename SrcPtrFn, typename DstPtrFn>
|
||||
__device__ __forceinline__ void reduceCopyPacks(
|
||||
int nThreads, int &thread,
|
||||
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
|
||||
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
|
||||
int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn,
|
||||
IntBytes &nBytesBehind, IntBytes &nBytesAhead
|
||||
) {
|
||||
static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
|
||||
@@ -66,10 +66,10 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
uintptr_t minDsts[MinDsts + !MinDsts];
|
||||
#pragma unroll
|
||||
for (int s=0; s < MinSrcs; s++)
|
||||
minSrcs[s] = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
|
||||
minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
|
||||
#pragma unroll
|
||||
for (int d=0; d < MinDsts; d++)
|
||||
minDsts[d] = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
|
||||
minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
|
||||
|
||||
// We dictate loop termination condition according to whether partial hunks
|
||||
// can be handled or not.
|
||||
@@ -114,7 +114,7 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
}
|
||||
|
||||
for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
|
||||
uintptr_t src = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
|
||||
uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
|
||||
BytePack<BytePerPack> tmp[Unroll];
|
||||
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
|
||||
#pragma unroll Unroll
|
||||
@@ -149,7 +149,7 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
}
|
||||
}
|
||||
for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) {
|
||||
uintptr_t dst = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
|
||||
uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
|
||||
#pragma unroll Unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
st_global<BytePerPack>(dst, acc[u]);
|
||||
@@ -183,11 +183,11 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
template<int Unroll, typename RedFn, typename T,
|
||||
int MultimemSrcs, int MinSrcs, int MaxSrcs,
|
||||
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
|
||||
typename IntBytes>
|
||||
typename IntBytes, typename SrcPtrFn, typename DstPtrFn>
|
||||
__device__ __forceinline__ void reduceCopy(
|
||||
int thread, int nThreads,
|
||||
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
|
||||
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
|
||||
int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn,
|
||||
IntBytes nElts
|
||||
) {
|
||||
static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values.");
|
||||
@@ -198,6 +198,9 @@ __device__ __forceinline__ void reduceCopy(
|
||||
// is supported for this redfn/type.
|
||||
constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize<RedFn>::BigPackSize;
|
||||
|
||||
if (MaxDsts==0) return;
|
||||
if (MinDsts==0 && nDsts==0) return;
|
||||
|
||||
IntBytes nBytesBehind = 0;
|
||||
IntBytes nBytesAhead = nElts*sizeof(T);
|
||||
|
||||
@@ -208,20 +211,20 @@ __device__ __forceinline__ void reduceCopy(
|
||||
#endif
|
||||
// Check that all pointers are BigPackSize aligned.
|
||||
bool aligned = true;
|
||||
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize);
|
||||
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize);
|
||||
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrFn(lane)) % (BigPackSize + !BigPackSize);
|
||||
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrFn(lane)) % (BigPackSize + !BigPackSize);
|
||||
aligned = __all_sync(~0u, aligned);
|
||||
if (aligned) {
|
||||
reduceCopyPacks<RedFn, T, Unroll, BigPackSize,
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
if (nBytesAhead == 0) return;
|
||||
|
||||
reduceCopyPacks<RedFn, T, /*Unroll=*/1, BigPackSize,
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
if (nBytesAhead == 0) return;
|
||||
}
|
||||
}
|
||||
@@ -229,13 +232,31 @@ __device__ __forceinline__ void reduceCopy(
|
||||
reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
if (nBytesAhead == 0) return;
|
||||
|
||||
reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/sizeof(T),
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
}
|
||||
|
||||
template<int Unroll, typename RedFn, typename T,
|
||||
int MultimemSrcs, int MinSrcs, int MaxSrcs,
|
||||
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
|
||||
typename IntBytes>
|
||||
__device__ __forceinline__ void reduceCopy(
|
||||
int thread, int nThreads,
|
||||
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs,
|
||||
IntBytes nElts
|
||||
) {
|
||||
reduceCopy<Unroll, RedFn, T,
|
||||
MultimemSrcs, MinSrcs, MaxSrcs,
|
||||
MultimemDsts, MinDsts, MaxDsts, PreOpSrcs, IntBytes>
|
||||
(thread, nThreads, redArg, preOpArgs, postOp,
|
||||
nSrcs, [=]__device__(int i) { return srcPtrs[i]; },
|
||||
nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts);
|
||||
}
|
||||
|
||||
#endif // COMMON_KERNEL_H_
|
||||
|
||||
@@ -74,11 +74,11 @@ else:
|
||||
################################################################################
|
||||
|
||||
algos_of_coll = {
|
||||
"AllGather": ["RING","NVLS"],
|
||||
"AllGather": ["RING","COLLNET_DIRECT","NVLS"],
|
||||
"AllReduce": all_algos,
|
||||
"Broadcast": ["RING"],
|
||||
"Reduce": ["RING"],
|
||||
"ReduceScatter": ["RING","NVLS"],
|
||||
"ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"],
|
||||
"SendRecv": [None]
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
uint64_t recvConnHead;
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile struct ncclConnFifo* sendConnFifo = NULL;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
uint64_t sendConnHead;
|
||||
uint64_t sendConnHeadCache; // Cache last seen value
|
||||
@@ -68,9 +68,9 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
sendConnHeadCache = *sendConnHeadPtr;
|
||||
if (checkAbort(spins, 1)) break;
|
||||
}
|
||||
if (sendConnFifoPtr) {
|
||||
if (sendConnFifo) {
|
||||
int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
|
||||
sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
|
||||
sendConnFifo[sendConnHead%NCCL_STEPS].size = size;
|
||||
}
|
||||
sendConnHead += 1;
|
||||
}
|
||||
@@ -315,7 +315,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
sendConnHeadPtr = sendConn->head;
|
||||
sendConnHeadCache = *sendConnHeadPtr;
|
||||
sendConnHead = sendConn->step;
|
||||
sendConnFifoPtr = sendConn->sizesFifo;
|
||||
sendConnFifo = sendConn->connFifo;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,7 +323,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
__device__ Primitives(
|
||||
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
|
||||
):
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
|
||||
|
||||
@@ -30,7 +30,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
uint64_t recvConnHead;
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile struct ncclConnFifo* sendConnFifo = NULL;
|
||||
volatile uint64_t* sendConnTailPtr = NULL;
|
||||
uint64_t sendConnTail;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
@@ -71,8 +71,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
sendConnHeadCache = *sendConnHeadPtr;
|
||||
if (checkAbort(spins, wid, 1)) break;
|
||||
}
|
||||
if (sendConnFifoPtr) {
|
||||
sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes;
|
||||
if (sendConnFifo) {
|
||||
sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes;
|
||||
}
|
||||
sendConnHead += 1;
|
||||
}
|
||||
@@ -350,10 +350,10 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
sendConnHeadPtr = sendConn->head;
|
||||
sendConnHeadCache = *sendConnHeadPtr;
|
||||
sendConnHead = sendConn->step;
|
||||
sendConnFifoPtr = sendConn->sizesFifo;
|
||||
sendConnFifo = sendConn->connFifo;
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid<fan.nsend()) {
|
||||
if (sendConn->sizesFifo) {
|
||||
if (sendConn->connFifo) {
|
||||
sendConnTailPtr = sendConn->tail;
|
||||
sendConnTail = sendConn->step;
|
||||
}
|
||||
|
||||
@@ -20,8 +20,8 @@ class Primitives<
|
||||
RolePostSend = 0x10,
|
||||
RolePostRecv = 0x20,
|
||||
Aborted = 0x40,
|
||||
OffsFifoEnabled = 0x80,
|
||||
SizesFifoEnabled = 0x100,
|
||||
UserBufferMode = 0x80,
|
||||
ConnFifoEnabled = 0x100,
|
||||
DirectWrite = 0x200,
|
||||
DirectRead = 0x400,
|
||||
ThreadsSynced = 0x800,
|
||||
@@ -39,15 +39,12 @@ class Primitives<
|
||||
int flags;
|
||||
int group;
|
||||
uint64_t step;
|
||||
int *connOffsFifoPtr; // (flags & OffsFifoEnabled)
|
||||
struct ncclConnFifo* connFifo = NULL;
|
||||
union {
|
||||
T *userBuff; // (flags & (RoleInput|RoleOutput))
|
||||
T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
|
||||
};
|
||||
union {
|
||||
int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
|
||||
T *directBuff; // !(flags & SizesFifoEnabled)
|
||||
};
|
||||
T *directBuff;
|
||||
uint64_t *connStepPtr;
|
||||
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
|
||||
void* mhandle;
|
||||
@@ -141,14 +138,16 @@ class Primitives<
|
||||
}
|
||||
|
||||
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
|
||||
if (isSendNotRecv && (flags & SizesFifoEnabled))
|
||||
connSizesFifoPtr[step%NCCL_STEPS] = nelts*sizeof(T);
|
||||
if (flags & ConnFifoEnabled)
|
||||
connFifo[step%NCCL_STEPS].size = nelts*sizeof(T);
|
||||
|
||||
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
|
||||
: (ncclShmem.groups[group].srcs + Src);
|
||||
if (flags & OffsFifoEnabled)
|
||||
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
|
||||
else if (isSendNotRecv && DirectSend) {
|
||||
if (flags & UserBufferMode) {
|
||||
// Do nothing
|
||||
} else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
|
||||
ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
|
||||
} else if (isSendNotRecv && DirectSend) {
|
||||
if (flags & (DirectWrite | NvlsDirectWrite)) {
|
||||
ptrs[index] = directBuff + dstIx + offset;
|
||||
} else if (flags & DirectRead) { // empty send
|
||||
@@ -179,7 +178,9 @@ class Primitives<
|
||||
inline __device__ void postPeer(bool dataStored) {
|
||||
if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
|
||||
step += StepPerSlice;
|
||||
if (Send && (flags & RolePostSend) && dataStored) fence_acq_rel_sys();
|
||||
if (Send && (flags & RolePostSend) && (dataStored||(flags&ConnFifoEnabled))) {
|
||||
fence_acq_rel_sys();
|
||||
}
|
||||
st_relaxed_sys_global(connStepPtr, step);
|
||||
}
|
||||
}
|
||||
@@ -199,7 +200,7 @@ class Primitives<
|
||||
int slice = 0;
|
||||
int offset = 0;
|
||||
|
||||
if (tid < nworkers && offset < nelem) {
|
||||
if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) {
|
||||
// Worker-only loop for non-empty slices. Non-workers and empty slices are
|
||||
// processed in the loop following this if block. The benefit of splitting
|
||||
// the loop like this is we pull two branches out of the critical path.
|
||||
@@ -301,6 +302,55 @@ class Primitives<
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
template<int Recv, int Send, typename Fn>
|
||||
__device__ __forceinline__ void process(Fn &&fn) {
|
||||
#pragma unroll 1
|
||||
for (int slice=0; slice < SlicePerChunk; slice++) {
|
||||
if (tid < nworkers) {
|
||||
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
|
||||
bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
|
||||
int spins = 0;
|
||||
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts
|
||||
: ncclShmem.groups[group].srcs;
|
||||
if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
|
||||
int offset = loadInt(&connFifo[step%NCCL_STEPS].offset);
|
||||
ptrs[index] = connEltsFifo + offset/sizeof(T);
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
}
|
||||
}
|
||||
subBarrier();
|
||||
fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend>
|
||||
(tid, nworkers, slice, stepSize*StepPerSlice,
|
||||
fan.nrecv(), ncclShmem.groups[group].srcs,
|
||||
fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes);
|
||||
}
|
||||
barrier();
|
||||
int32_t dstSize = 0;
|
||||
if (flags & Send*RolePostSend) {
|
||||
dstSize = ncclShmem.groups[group].dstSizes[index];
|
||||
ncclShmem.groups[group].dstSizes[index] = 0;
|
||||
if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T);
|
||||
}
|
||||
barrier();
|
||||
if (flags & (Recv*(RoleWaitRecv|RolePostRecv) | Send*(RoleWaitSend|RolePostSend))) {
|
||||
step += StepPerSlice;
|
||||
}
|
||||
if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
|
||||
if (Send && (!Recv || (flags & RolePostSend)) && (dstSize!=0 || (flags&ConnFifoEnabled))) {
|
||||
fence_acq_rel_sys();
|
||||
}
|
||||
st_relaxed_sys_global(connStepPtr, step);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Scatter/Gather generic op
|
||||
// skip: my own rank order in the buffer chunks
|
||||
// shift: peer offset to avoid all ranks sending to or receiving from same peer
|
||||
@@ -386,8 +436,11 @@ class Primitives<
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->tail;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
|
||||
if (Direct) {
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
if (conn->connFifo != nullptr) {
|
||||
flags |= ConnFifoEnabled;
|
||||
connFifo = conn->connFifo;
|
||||
} else if (Direct) {
|
||||
// User buffers have been registered
|
||||
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
@@ -409,9 +462,6 @@ class Primitives<
|
||||
flags |= NvlsDirectRead;
|
||||
}
|
||||
}
|
||||
if (flags & OffsFifoEnabled)
|
||||
connOffsFifoPtr = conn->offsFifo;
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -421,6 +471,10 @@ class Primitives<
|
||||
auto *conn = &peer->send[connIndex];
|
||||
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
|
||||
connFifo = conn->connFifo;
|
||||
if (connFifo != nullptr) flags |= ConnFifoEnabled;
|
||||
|
||||
if (flags & RolePostSend) {
|
||||
connStepPtr = conn->tail;
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
@@ -430,15 +484,8 @@ class Primitives<
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->head;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
|
||||
if (flags & OffsFifoEnabled)
|
||||
connOffsFifoPtr = conn->offsFifo;
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
|
||||
if (conn->sizesFifo != nullptr) {
|
||||
flags |= SizesFifoEnabled;
|
||||
connSizesFifoPtr = conn->sizesFifo;
|
||||
} else if (Direct) {
|
||||
if (connFifo == nullptr && Direct) {
|
||||
// User buffers have been registered
|
||||
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
@@ -468,7 +515,7 @@ class Primitives<
|
||||
__device__ Primitives(
|
||||
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, int stepSize_=0
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
|
||||
):
|
||||
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
|
||||
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
|
||||
@@ -507,6 +554,8 @@ class Primitives<
|
||||
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
|
||||
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
|
||||
|
||||
if (p2p && p2p->reg) flags |= UserBufferMode;
|
||||
|
||||
if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
flags |= AnyNetDeviceUnpack;
|
||||
// g == 0 is the first ThreadPerSync # of threads of this warp
|
||||
@@ -533,10 +582,21 @@ class Primitives<
|
||||
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
|
||||
conns[index]->step = step;
|
||||
}
|
||||
|
||||
if ((flags & UserBufferMode) && (flags & RoleWaitSend)) {
|
||||
// Make sure we wait until the proxy has sent data before we return.
|
||||
// We don't want the next CUDA kernel to overwrite the send buffer which
|
||||
// was accessed directly.
|
||||
uint64_t prevStep = step - StepPerSlice;
|
||||
volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size);
|
||||
while (*ptr != -1);
|
||||
}
|
||||
|
||||
if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
|
||||
ncclNetDeviceSaveHead(netDeviceHandle, group);
|
||||
}
|
||||
|
||||
// Make sure all threads are done writing back conn->step and done using
|
||||
// ncclShmem.groups[group]
|
||||
barrier();
|
||||
}
|
||||
|
||||
|
||||
+15
-32
@@ -12,56 +12,39 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int prevRank = ring->userRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
|
||||
auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
|
||||
int realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
|
||||
return realChunkSize;
|
||||
};
|
||||
|
||||
if (prevRank == root) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = calcChunkSize(gridOffset);
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
else if (rank == root) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = calcChunkSize(gridOffset);
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = calcChunkSize(gridOffset);
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -616,7 +616,7 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
BytePack<2*sizeof(T)> tmp; \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
} \
|
||||
};
|
||||
@@ -629,11 +629,11 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
if (fn.isMinNotMax) { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
} else { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
} \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
} \
|
||||
|
||||
+177
-46
@@ -12,56 +12,43 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
int const *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t offset;
|
||||
size_t dataOffset;
|
||||
size_t count = args->count;
|
||||
uint32_t nelem;
|
||||
int rankDest;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
|
||||
realChunkSize = int(realChunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*int(realChunkSize);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
|
||||
dataOffset = gridOffset + elemOffset;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ringRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.send(offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ringRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
rankDest = ringRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
prims.recvReduceCopy(offset, chunkOffset, nelem, /*postOp=*/true);
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.recvReduceCopy(offset, dataOffset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -92,14 +79,15 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t count = args->count;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
|
||||
* if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
|
||||
@@ -116,10 +104,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.scatter(offset, nvls->nHeads * size, nelem, size, -1, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce through NVLS
|
||||
@@ -127,9 +115,9 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recv(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -140,7 +128,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
}
|
||||
|
||||
@@ -152,10 +140,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t outOffset = gridOffset + bid * chunkSize;
|
||||
ssize_t inpOffset = outOffset + rank * size;
|
||||
int nelem = min(chunkSize, size - outOffset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
size_t outOffset = gridOffset + elemOffset;
|
||||
size_t inpOffset = outOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopy(inpOffset, outOffset, nelem);
|
||||
}
|
||||
|
||||
@@ -165,3 +153,146 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
template<bool ReduceSendNotRecv>
|
||||
struct Scatterer {
|
||||
struct ncclWorkElem* args;
|
||||
int chunkSize;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
|
||||
) {
|
||||
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = direct->nHeads;
|
||||
int bid = args->bid;
|
||||
void* inbuf = (void*)args->sendbuff;
|
||||
ssize_t sizePerRank = args->count;
|
||||
|
||||
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
|
||||
int dst = 0;
|
||||
int rail;
|
||||
if (!ReduceSendNotRecv) {
|
||||
rail = direct->headRank;
|
||||
} else {
|
||||
rail = direct->headRank+1;
|
||||
if (rail == nRails) rail = 0;
|
||||
}
|
||||
do {
|
||||
int node = railAllBeg/sizePerRank;
|
||||
int railAllOffset = 0;
|
||||
while (railAllOffset < railAllSize) {
|
||||
ssize_t railOneBeg = node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
|
||||
int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
|
||||
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
|
||||
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
|
||||
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
|
||||
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
|
||||
/*PreOpSrcs=*/1>
|
||||
(tid, tn, args->redOpArg, &args->redOpArg, false,
|
||||
/*nSrcs=*/1+nSrcs, [=]__device__(int s) {
|
||||
return s==0 ? (T*)inbuf + userOneBeg
|
||||
: (T*)srcPtrs[s-1] + railAllOffset;
|
||||
},
|
||||
/*nDsts=*/1, [=]__device__(int d/*==0*/) {
|
||||
return (T*)dstPtrs[dst] + railAllOffset;
|
||||
},
|
||||
delta);
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
dst += 1;
|
||||
rail += 1;
|
||||
if (rail == nRails) rail = 0;
|
||||
} while (ReduceSendNotRecv && dst < nRails-1);
|
||||
}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
int tid = threadIdx.x;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int const &nNodes = ncclShmem.comm.nNodes;
|
||||
ssize_t chunkSize = int(args->chunkCount);
|
||||
ssize_t sizePerRank = args->count;
|
||||
|
||||
if (direct->out == -1) __trap();
|
||||
bool isMultiRail = (direct->nHeads > 1);
|
||||
int nWarps1 = (isMultiRail ? 2 : 0);
|
||||
int nWarps2 = (isMultiRail ? 2 : 1);
|
||||
int nWarps3 = 1;
|
||||
float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
|
||||
nWarps3 = int(denom*nWarps3);
|
||||
nWarps2 = int(denom*nWarps2);
|
||||
nWarps1 = args->nWarps - (nWarps2+nWarps3);
|
||||
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
|
||||
int tn = nWarps1*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 1: Scatter inputs to peers
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
|
||||
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/true> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/0, /*Send=*/1>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
tid -= tn;
|
||||
|
||||
tn = nWarps2*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 2: Reduce from peers + local input -> send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, direct->heads+1, &direct->out, nullptr, nullptr,
|
||||
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/false> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
tid -= tn;
|
||||
|
||||
tn = nWarps3*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 3: recv from network
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
|
||||
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.recv(beg-railOneBeg, max(ssize_t(0), end-beg), /*postOp=*/true);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -26,13 +26,13 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
|
||||
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
size_t offset = 0;
|
||||
do {
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
} while(offset < count && args->reg == 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,13 +45,13 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
|
||||
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
size_t offset = 0;
|
||||
do {
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
} while(offset < count && args->reg == 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+794
-451
Failā izmaiņas netiks attēlotas, jo tās ir par lielu
Ielādēt izmaiņas
+90
-37
@@ -19,6 +19,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
int localRanks = comm->topo->nodes[GPU].count;
|
||||
int nChannels = comm->nChannels;
|
||||
|
||||
topoRanks->nvlsHeadNum = 0;
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->ring.prev = channel->ring.next = -1;
|
||||
@@ -30,20 +31,20 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
channel->collnetDirect.headRank = -1;
|
||||
channel->collnetDirect.nHeads = 0;
|
||||
channel->collnetDirect.shift = 0;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
|
||||
|
||||
int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
|
||||
int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
|
||||
int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks;
|
||||
|
||||
for (int i=0; i<localRanks; i++) {
|
||||
if (ringIntra[i] == rank) {
|
||||
topoRanks->ringRecv[c] = ringIntra[0];
|
||||
topoRanks->ringSend[c] = ringIntra[localRanks-1];
|
||||
channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1];
|
||||
channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
|
||||
topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
|
||||
topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
|
||||
}
|
||||
if (treeIntra[i] == rank) {
|
||||
int parentIndex = 0;
|
||||
@@ -61,14 +62,28 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
|
||||
}
|
||||
}
|
||||
topoRanks->ringPrev[c] = channel->ring.prev;
|
||||
topoRanks->ringNext[c] = channel->ring.next;
|
||||
topoRanks->nvlsHeads[c] = nvlsIntra[0];
|
||||
}
|
||||
// Duplicate channels rings/trees
|
||||
// Duplicate channels trees
|
||||
struct ncclChannel* channel0 = comm->channels;
|
||||
struct ncclChannel* channel1 = channel0+nChannels;
|
||||
memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
|
||||
|
||||
// Get nvls heads and the number of heads. Duplicate head is not allowed.
|
||||
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
|
||||
bool addHead = true;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
|
||||
|
||||
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
|
||||
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
|
||||
addHead = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (addHead) {
|
||||
topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
|
||||
}
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -80,26 +95,14 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
|
||||
int* send = ringSend+c*comm->nNodes;
|
||||
int* prev = ringPrev+c*comm->nRanks;
|
||||
int* next = ringNext+c*comm->nRanks;
|
||||
struct ncclChannel* channel0 = comm->channels+c;
|
||||
struct ncclChannel* channel1 = channel0+nChannels;
|
||||
for (int n=0; n<nNodes; n++) {
|
||||
int recvRank = recv[n];
|
||||
int prevSendRank = send[(n-1+nNodes)%nNodes];
|
||||
prev[recvRank] = prevSendRank;
|
||||
if (comm->rank == recvRank) {
|
||||
channel0->ring.prev = prevSendRank;
|
||||
channel1->ring.prev = prevSendRank;
|
||||
}
|
||||
int sendRank = send[n];
|
||||
int nextRecvRank = recv[(n+1)%nNodes];
|
||||
next[sendRank] = nextRecvRank;
|
||||
if (comm->rank == sendRank) {
|
||||
channel0->ring.next = nextRecvRank;
|
||||
channel1->ring.next = nextRecvRank;
|
||||
}
|
||||
}
|
||||
TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
|
||||
TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -209,6 +212,15 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
channel->collnetDirect.up[nUp++] = heads[h];
|
||||
sprintf(line+strlen(line), " %d ", heads[h]);
|
||||
}
|
||||
sprintf(line+strlen(line), "heads ");
|
||||
{ // heads[] is the list of heads ordered in head order starting with self
|
||||
int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank;
|
||||
for (int h1=0; h1 < nHeads; h1++) {
|
||||
int h = (h0+h1)%nHeads;
|
||||
channel->collnetDirect.heads[h1] = heads[h];
|
||||
sprintf(line+strlen(line), " %d ", heads[h]);
|
||||
}
|
||||
}
|
||||
channel->collnetDirect.nHeads = nHeads;
|
||||
channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
|
||||
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
|
||||
@@ -217,27 +229,22 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
channel->collnetChain.depth = comm->nRanks/comm->nNodes;
|
||||
}
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
}
|
||||
free(heads);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) {
|
||||
int nHeads = nvlsGraph->nChannels;
|
||||
static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) {
|
||||
int headRank = -1;
|
||||
for (int h=0; h<nHeads; h++) {
|
||||
if (nvlsGraph->intra[h*comm->localRanks] == comm->rank) headRank = h;
|
||||
}
|
||||
|
||||
if (nHeads == 0) {
|
||||
comm->nvlsChannels = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
for (int h = 0; h < nHeads; h++) {
|
||||
if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
|
||||
}
|
||||
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->nvls.nHeads = nHeads;
|
||||
for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
|
||||
@@ -248,8 +255,10 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct nc
|
||||
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
|
||||
channel->nvls.node = comm->node;
|
||||
channel->nvls.nNodes = comm->nNodes;
|
||||
if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
}
|
||||
if (comm->nNodes == 1) return ncclSuccess;
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes == 1 || comm->MNNVL) return ncclSuccess;
|
||||
|
||||
// Connect Trees
|
||||
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
|
||||
@@ -290,7 +299,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct nc
|
||||
}
|
||||
// Set prev/next in all channels (NVLS compute channels work
|
||||
// orthogonally to NVLS search channels).
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->nvls.treeUp = treeUp[c%2];
|
||||
channel->nvls.treeDown[0] = channel->nvls.down;
|
||||
@@ -348,12 +357,19 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev
|
||||
return c;
|
||||
}
|
||||
|
||||
// Swap the two integers pointed to by v0 and v1.
// Both pointers must be valid; passing the same address twice leaves the
// value unchanged.
void exchangeValues(int* v0, int* v1) {
  int tmp = *v1;
  *v1 = *v0;
  *v0 = tmp;
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
|
||||
int nranks = comm->nRanks;
|
||||
int nNodes = comm->nNodes;
|
||||
int nChannels = comm->nChannels;
|
||||
int minHeadNum = INT_MAX;
|
||||
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
|
||||
@@ -362,6 +378,22 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
|
||||
|
||||
// Alternate rings to avoid crossing rails
|
||||
if (graphs[NCCL_ALGO_RING]->crossNic && (comm->nNodes % 2) == 0 && (nChannels % 2) == 0) {
|
||||
for (int r=0; r<comm->nRanks; r++) {
|
||||
if (comm->rankToNode[r] % 2 == 1) {
|
||||
// Exchange rings
|
||||
for (int c=0; c<nChannels; c+=2) {
|
||||
exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
|
||||
exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
|
||||
exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
|
||||
exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int c=0; c<nChannels;c++) {
|
||||
for (int n=0; n<nNodes; n++) {
|
||||
int r = firstRanks[n];
|
||||
@@ -376,22 +408,36 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
|
||||
}
|
||||
}
|
||||
for (int c=0; c<graphs[NCCL_ALGO_NVLS]->nChannels; c++) {
|
||||
for (int n=0; n<nNodes; n++) {
|
||||
|
||||
for (int n = 0; n < nNodes; n++) {
|
||||
int r = firstRanks[n];
|
||||
if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
|
||||
minHeadNum = allTopoRanks[r]->nvlsHeadNum;
|
||||
}
|
||||
|
||||
for (int c = 0; c < minHeadNum; c++) {
|
||||
for (int n = 0; n < nNodes; n++) {
|
||||
int r = firstRanks[n];
|
||||
nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
|
||||
nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
|
||||
}
|
||||
}
|
||||
|
||||
// Connect rings and trees. This should also duplicate the channels.
|
||||
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
|
||||
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
|
||||
NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS]));
|
||||
|
||||
// Duplicate ringPrev/ringNext for ncclBuildRing
|
||||
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
|
||||
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
|
||||
|
||||
// Set ring prev/next for my rank
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel0 = comm->channels+c;
|
||||
struct ncclChannel* channel1 = channel0+nChannels;
|
||||
channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
|
||||
channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
|
||||
}
|
||||
|
||||
// Duplication should be complete now
|
||||
nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
|
||||
|
||||
@@ -407,7 +453,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
}
|
||||
|
||||
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
|
||||
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= MAXCHANNELS) {
|
||||
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
|
||||
}
|
||||
|
||||
@@ -422,6 +468,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
|
||||
}
|
||||
|
||||
comm->collChannels = comm->nChannels;
|
||||
// Support maximal channel usage for aggregation
|
||||
if (comm->nChannels < comm->nvlsChannels) {
|
||||
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
|
||||
}
|
||||
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
|
||||
|
||||
// Create rings array and check all is fine
|
||||
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
|
||||
|
||||
|
||||
+43
-5
@@ -341,6 +341,23 @@ compare:
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// MNNVL: Check whether peers are in the same fabric cluster and clique
|
||||
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret) {
|
||||
*ret = 0;
|
||||
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo;
|
||||
// A zero UUID means we don't have MNNVL fabric info
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
|
||||
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
|
||||
*ret = 1;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
||||
int ncclTopoUserGdrLevel = -1;
|
||||
|
||||
@@ -652,7 +669,8 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
|
||||
}
|
||||
|
||||
if (system->nodes[GPU].count == comm->nRanks) {
|
||||
// MNNVL: Remove network nodes as they are connected via NVLink
|
||||
if (system->nodes[GPU].count == comm->nRanks || comm->MNNVL) {
|
||||
for (int n=system->nodes[NET].count-1; n>=0; n--)
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
}
|
||||
@@ -666,10 +684,11 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
|
||||
free(system);
|
||||
}
|
||||
|
||||
NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 2);
|
||||
NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1);
|
||||
|
||||
static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
|
||||
static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) {
|
||||
int peer;
|
||||
struct ncclTopoSystem* system = comm->topo;
|
||||
struct ncclTopoLinkList* path = NULL;
|
||||
if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) {
|
||||
// Same rank
|
||||
@@ -685,9 +704,28 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
|
||||
} else {
|
||||
*nChannels = 2;
|
||||
}
|
||||
} else if (comm->MNNVL) {
|
||||
// MNNVL assume all GPUs are connected via NVLink
|
||||
path = system->nodes[GPU].nodes[g].paths[GPU]+((g+1)%system->nodes[GPU].count);
|
||||
float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
|
||||
*nChannels = 2*std::max(1, (int)(path->bw / nvlBw));
|
||||
} else {
|
||||
// Remote rank, use network
|
||||
*nChannels = ncclParamNChannelsPerNetPeer();
|
||||
int nNetChannels = ncclParamNChannelsPerNetPeer();
|
||||
if (nNetChannels == -1) {
|
||||
//start from 2 channels per NIC and reduce with scale
|
||||
nNetChannels = 2;
|
||||
|
||||
// check if we need to use more than one NIC, hence more than one channel
|
||||
int netCountByBw = 1, nChannelsMax = nNetChannels;
|
||||
NCCLCHECK(getLocalNetCountByBw(system, g, &netCountByBw));
|
||||
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
|
||||
while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
|
||||
|
||||
// Allow up to the number of channels required to drive the NICs
|
||||
nNetChannels = std::max(netCountByBw, nChannelsMax);
|
||||
}
|
||||
*nChannels = nNetChannels;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -716,7 +754,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
|
||||
for (int r=0; r<comm->nRanks; r++) {
|
||||
int nChannels;
|
||||
NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels));
|
||||
NCCLCHECK(ncclTopoGetNchannels(comm, g, r, &nChannels));
|
||||
if (nChannels >= 0) minChannels = std::min(minChannels, nChannels);
|
||||
}
|
||||
}
|
||||
|
||||
+25
-11
@@ -372,13 +372,12 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
return ncclSuccess;
|
||||
}
|
||||
// 2. Try to get better bandwidth
|
||||
// Give a 15% perf bonus to paths not crossing nics
|
||||
float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
|
||||
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) {
|
||||
// Give a 5% perf bonus to paths not crossing nics
|
||||
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
|
||||
*copy = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess;
|
||||
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;
|
||||
|
||||
// 3. Less hops
|
||||
if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
|
||||
@@ -484,6 +483,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
|
||||
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
|
||||
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
|
||||
|
||||
// Balanced Tree : count half of the bandwidth on first two GPUs
|
||||
int nextBackToNet = -1;
|
||||
@@ -555,6 +555,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->collNet && net->net.collSupport == 0) continue;
|
||||
if (net->net.bw < bw) continue;
|
||||
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
|
||||
|
||||
graph->inter[graph->nChannels*2] = net->id;
|
||||
graph->latencyInter = net->net.latency;
|
||||
@@ -1071,16 +1072,29 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
|
||||
|
||||
#include "comm.h"
|
||||
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
|
||||
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) {
|
||||
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int* dev) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int localRanks = comm->topo->nodes[GPU].count;
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
if (graph->intra[c*localRanks] == comm->rank) {
|
||||
*dev = graph->inter[c*2];
|
||||
return ncclSuccess;
|
||||
int netNum = 0;
|
||||
int net[MAXCHANNELS];
|
||||
|
||||
for (int c = 0; c < graph->nChannels; c++) {
|
||||
if (graph->intra[c * localRanks] == comm->rank) {
|
||||
net[netNum++] = graph->inter[c * 2];
|
||||
}
|
||||
}
|
||||
if (netNum) {
|
||||
*dev = net[channelId % netNum];
|
||||
} else {
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
|
||||
return ncclInternalError;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
|
||||
@@ -1095,7 +1109,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
|
||||
if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
|
||||
*dev = graph->inter[channel*2+index];
|
||||
} else {
|
||||
NCCLCHECK(getNvlsNetDev(comm, graph, dev));
|
||||
NCCLCHECK(getNvlsNetDev(comm, graph, channelId, dev));
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
|
||||
} else if (peerRank == -1) {
|
||||
|
||||
+35
-6
@@ -180,12 +180,17 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
|
||||
// even though they're supposed to sustain full BW across all ports.
|
||||
// Flatten the switch as this extra level can break the search and make
|
||||
// NCCL take wrong topology decisions.
|
||||
int getBcmGen(uint64_t id, int level) {
|
||||
if ((id & 0xfffffffffffff000) == 0x1000c0101000a000) return 4;
|
||||
if ((id & 0xfffffffffffff000) == (0x1000c03010000000 | level*0x1000)) return 5;
|
||||
return 0;
|
||||
}
|
||||
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
for (int s=0; s<system->nodes[PCI].count; s++) {
|
||||
struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
|
||||
uint64_t device = pciSwitch->pci.device;
|
||||
// Only flatten PEX Gen 4 switches in base mode
|
||||
if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
|
||||
int gen = getBcmGen(pciSwitch->pci.device, 0);
|
||||
// Flatten Gen4 PEX switches in base mode
|
||||
if (gen) {
|
||||
// Find sub switches with the same device ID.
|
||||
int64_t* subSwIds;
|
||||
NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
|
||||
@@ -193,7 +198,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
for (int l=0; l<pciSwitch->nlinks; l++) {
|
||||
struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
|
||||
// Only fuse sub switches with the same device ID.
|
||||
if (sub->type != PCI || sub->pci.device != device) continue;
|
||||
if (sub->type != PCI || getBcmGen(sub->pci.device, 1) != gen) continue;
|
||||
// Save sub switch for later
|
||||
subSwIds[subs++] = sub->id;
|
||||
// Remove link to that sub switch
|
||||
@@ -225,8 +230,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
}
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
|
||||
}
|
||||
// Set subdevice to 0x0000 to make sure we don't merge this switch again.
|
||||
pciSwitch->pci.device = 0x1000c01010000000;
|
||||
// Set subdevice to 0xffff to make sure we don't merge this switch again.
|
||||
pciSwitch->pci.device |= 0xffff;
|
||||
free(subSwIds);
|
||||
// Restart, as system->nodes[PCI].nodes has changed.
|
||||
s = 0;
|
||||
@@ -732,6 +737,30 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) {
|
||||
int localNetCount = 0, netCountByBw = 0;
|
||||
int* localNets;
|
||||
float totalNetBw = 0, gpuBw = 0;
|
||||
|
||||
for (int l=0; l<system->nodes[GPU].nodes[gpu].nlinks; l++) {
|
||||
//assuming BW to CPU reflects the GPU bandwidth via P2P or C2C
|
||||
//caveat, this could be wrong if there is a PCIe switch,
|
||||
//and a narrower link to the CPU
|
||||
if (system->nodes[GPU].nodes[gpu].links[l].remNode->type == CPU) {
|
||||
gpuBw = system->nodes[GPU].nodes[gpu].links[l].bw;
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
|
||||
for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) {
|
||||
totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw;
|
||||
}
|
||||
*count = netCountByBw;
|
||||
|
||||
free(localNets);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
|
||||
int gpu;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#define SM60_NVLINK_BW 18.0
|
||||
#define SM70_NVLINK_BW 20.0
|
||||
#define SM80_NVLINK_BW 20.0
|
||||
#define SM90_NVLINK_BW 20.0
|
||||
#define SM90_NVLINK_BW 20.6
|
||||
#define SM86_NVLINK_BW 12.0
|
||||
#define PCI_BW 12.0 // PCI Gen3 x16
|
||||
#define QPI_BW 6.0
|
||||
|
||||
+40
-19
@@ -132,7 +132,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
|
||||
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
||||
|
||||
int nNodes = comm->nNodes;
|
||||
// MNNVL support - treat as a single NVLink connected node
|
||||
int nNodes = comm->MNNVL ? 1 : comm->nNodes;
|
||||
int nRanks = comm->nRanks;
|
||||
if (nRanks <= 1) return ncclSuccess;
|
||||
|
||||
@@ -165,8 +166,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
|
||||
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
|
||||
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
|
||||
@@ -186,19 +187,38 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
|
||||
// Collnet+Direct requires all GPUs to have a local NIC to work at full speed
|
||||
float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
|
||||
factor -= (factor-1)/2;
|
||||
busBw /= factor;
|
||||
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
|
||||
busBw = ppn * bw;
|
||||
// AllGather/ReduceScatter requires 1:1 GPU:NIC
|
||||
int nicPerNode = comm->collNetHeadsUniqueNum;
|
||||
if (coll == ncclFuncAllGather && comm->nNodes > 1) {
|
||||
if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0;
|
||||
}
|
||||
if (coll == ncclFuncReduceScatter && comm->nNodes > 1) {
|
||||
if (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter || ppn > nicPerNode) busBw = 0;
|
||||
}
|
||||
// Measured corrective ratio needed at 1 ppn and 8ppn. Here we hackishly
|
||||
// interpolate the two.
|
||||
float w = (ppn-1)/(8-1);
|
||||
busBw *= w*0.85 + (1-w)*0.95;
|
||||
} else {
|
||||
// Collnet+Direct requires all GPUs to have a local NIC to work at full speed
|
||||
float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
|
||||
factor -= (factor-1)/2;
|
||||
busBw /= factor;
|
||||
if (minCompCap >= 90) busBw *= .85;
|
||||
}
|
||||
}
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
float ratio;
|
||||
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
|
||||
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio = 5.0/6.0;
|
||||
else ratio = .5;
|
||||
comm->bandwidths[coll][a][p] = busBw * ratio;
|
||||
if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
|
||||
float ratio = 1.0f;
|
||||
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
|
||||
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
|
||||
else ratio *= .5;
|
||||
busBw *= ratio;
|
||||
}
|
||||
comm->bandwidths[coll][a][p] = busBw;
|
||||
/* Ring bandwidth backup */
|
||||
if (a == NCCL_ALGO_RING)
|
||||
comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];
|
||||
@@ -262,18 +282,19 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||
}
|
||||
|
||||
if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes == 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
|
||||
|
||||
// Disable CollNet if it is not supported
|
||||
if (comm->collNetSupport == 0) {
|
||||
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
||||
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
|
||||
if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes > 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||
// If user has hard set NCCL_ALGO=COLLNET, ignore it
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
|
||||
algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
|
||||
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
|
||||
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
|
||||
}
|
||||
} else {
|
||||
// Disable CollNet+Direct if not on an NVSwitch system
|
||||
@@ -398,9 +419,9 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
|
||||
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
|
||||
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
|
||||
float lat = info->comm->latencies[info->coll][algorithm][protocol];
|
||||
|
||||
|
||||
if (backup) {
|
||||
*backup = false;
|
||||
if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
|
||||
@@ -416,7 +437,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
int logSize = log2i(info->nBytes>>6);
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && (!info->comm->MNNVL && info->comm->nNodes > 1)
|
||||
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
|
||||
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
|
||||
}
|
||||
|
||||
@@ -592,8 +592,8 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
|
||||
for (int s=0; s<parent->nSubs; s++) {
|
||||
const char* busId;
|
||||
NCCLCHECK(xmlGetAttrStr(parent->subs[s], "busid", &busId));
|
||||
if (strcmp(newBusId, busId) < 0) { subIndex = s; break; }
|
||||
NCCLCHECK(xmlGetAttr(parent->subs[s], "busid", &busId));
|
||||
if (busId != NULL && strcmp(newBusId, busId) < 0) { subIndex = s; break; }
|
||||
}
|
||||
for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
|
||||
parent->subs[subIndex] = pciNode;
|
||||
|
||||
+5
-5
@@ -235,9 +235,9 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
|
||||
// Reset comm->tasks to empty.
|
||||
comm->tasks.nTasksColl = 0;
|
||||
comm->tasks.nTasksP2p = 0;
|
||||
comm->tasks.workBytesTotal = 0;
|
||||
comm->tasks.streams = nullptr;
|
||||
ncclIntruQueueConstruct(&comm->tasks.collQueue);
|
||||
comm->tasks.collBytesTotal = 0;
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
|
||||
@@ -321,9 +321,9 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
|
||||
assert(state == ncclGroupJobJoined);
|
||||
}
|
||||
|
||||
if (*groupAbortFlag == true || errorJobAbortFlag == true) {
|
||||
*job->abortFlag = 1;
|
||||
if (job->childAbortFlag) *job->childAbortFlag = 1;
|
||||
if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) {
|
||||
__atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED);
|
||||
if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
job = job->next;
|
||||
@@ -438,7 +438,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
|
||||
|
||||
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
|
||||
if (groupJob && groupJob->initialized) {
|
||||
*groupJob->abortFlagPtr = true;
|
||||
__atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED);
|
||||
NCCLCHECK(ncclGroupJobComplete(groupJob));
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -85,13 +85,14 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
|
||||
CUmemAllocationProp prop = {};
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
CUmemGenericAllocationHandle handle;
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
|
||||
prop.requestedHandleTypes = type;
|
||||
prop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
|
||||
|
||||
@@ -12,5 +12,6 @@
|
||||
|
||||
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
|
||||
ncclResult_t ArgsCheck(struct ncclInfo* info);
|
||||
ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -19,9 +19,9 @@ static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNet
|
||||
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
/* DMA-BUF support */
|
||||
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
|
||||
@@ -45,4 +45,15 @@ inline int ncclTypeSize(ncclDataType_t type) {
|
||||
}
|
||||
}
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
#define NCCL_MODE_NORMAL 0
|
||||
#define NCCL_MODE_OFFSET 1
|
||||
#define NCCL_MODE_PTR 2
|
||||
struct ncclConnFifo {
|
||||
int mode;
|
||||
int offset;
|
||||
ssize_t size;
|
||||
void* ptr;
|
||||
};
|
||||
#endif
|
||||
|
||||
+12
-29
@@ -14,6 +14,7 @@
|
||||
#include "proxy.h"
|
||||
#include "strongstream.h"
|
||||
#include "nccl_net.h"
|
||||
#include "register.h"
|
||||
|
||||
#if CUDART_VERSION < 9000
|
||||
struct cudaLaunchParams {
|
||||
@@ -54,8 +55,7 @@ struct ncclRecvMem {
|
||||
struct {
|
||||
uint64_t tail;
|
||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
int sizesFifo[NCCL_STEPS];
|
||||
int offsFifo[NCCL_STEPS];
|
||||
struct ncclConnFifo connFifo[NCCL_STEPS];
|
||||
int flush; // For GDRCopy-based flush
|
||||
};
|
||||
char pad4[MEM_ALIGN];
|
||||
@@ -169,7 +169,6 @@ struct ncclKernelPlan {
|
||||
// A kernel plan is also a callback that reclaims itself. Hence this must
|
||||
// be the first member.
|
||||
struct ncclCommCallback reclaimer;
|
||||
struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup
|
||||
|
||||
struct ncclComm* comm;
|
||||
struct ncclKernelPlan* next;
|
||||
@@ -200,23 +199,7 @@ struct ncclKernelPlan {
|
||||
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
|
||||
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
|
||||
} channels[MAXCHANNELS];
|
||||
};
|
||||
|
||||
struct ncclRegRequest {
|
||||
uintptr_t buff;
|
||||
size_t size;
|
||||
struct ncclRegRequest *next;
|
||||
};
|
||||
|
||||
struct ncclRegRecord {
|
||||
uintptr_t buff;
|
||||
size_t size;
|
||||
CUdeviceptr regAddr;
|
||||
size_t regSize;
|
||||
int dev;
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */
|
||||
struct ncclRegRecord *next;
|
||||
size_t maxBytesPerChannel;
|
||||
};
|
||||
|
||||
struct ncclComm {
|
||||
@@ -262,6 +245,7 @@ struct ncclComm {
|
||||
int* localRankToRank;
|
||||
// localRanks and localRanktoRank for all nodes
|
||||
struct ncclNodeRanks* nodeRanks;
|
||||
int MNNVL; // MNNVL: Multi-Node NVLink
|
||||
|
||||
bool checkPointers;
|
||||
bool dmaBufSupport;
|
||||
@@ -270,8 +254,9 @@ struct ncclComm {
|
||||
uint64_t opCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
int nvlsChannels;
|
||||
int nChannels; // connection nChannels
|
||||
int collChannels; // enqueue nChannels
|
||||
int nvlsChannels; // enqueue nChannels
|
||||
int collNetChannels;
|
||||
// Channels (per peer) for p2p
|
||||
int p2pnChannels;
|
||||
@@ -334,6 +319,9 @@ struct ncclComm {
|
||||
int intraHighestTransportType;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
int collNetHeadsUniqueNum;
|
||||
int* collNetDenseToUserRank;
|
||||
int* collNetUserToDenseRank;
|
||||
/* sharable collNet proxy progress resource. */
|
||||
struct ncclCollNetSharedRes* collNetSharedRes;
|
||||
|
||||
@@ -343,8 +331,6 @@ struct ncclComm {
|
||||
/* sharable NVLS resource. */
|
||||
struct ncclNvlsSharedRes* nvlsResources;
|
||||
|
||||
ssize_t channelSize; // User requested work size (bytes) for channel partitions
|
||||
|
||||
// pools backed by comm->memPermanent
|
||||
struct ncclMemoryPool memPool_ncclProxyOp;
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
@@ -380,13 +366,10 @@ struct ncclComm {
|
||||
// group job to support multi-thread FT
|
||||
struct ncclGroupJob *groupJob;
|
||||
|
||||
/* store to buffer register request */
|
||||
struct ncclIntruQueue<struct ncclRegRequest, &ncclRegRequest::next> regRequestQueue;
|
||||
/* store registered buffer */
|
||||
struct ncclIntruQueue<struct ncclRegRecord, &ncclRegRecord::next> regRecordQueue;
|
||||
|
||||
// Tuning plugin
|
||||
ncclTuner_t* tuner;
|
||||
// buffer registration cache
|
||||
struct ncclRegCache regCache;
|
||||
};
|
||||
|
||||
enum ncclLaunchMode {
|
||||
|
||||
@@ -16,6 +16,10 @@ extern int ncclCuMemEnable();
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cudaTypedefs.h>
|
||||
|
||||
// Handle type used for cuMemCreate()
|
||||
extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
|
||||
#else
|
||||
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
|
||||
|
||||
+67
-45
@@ -96,8 +96,7 @@ struct ncclConnInfo {
|
||||
void **ptrExchange; // Pointer exchange for direct communication
|
||||
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
|
||||
|
||||
int *sizesFifo; // Sizes fifo from GPU to proxy
|
||||
int *offsFifo; // Buffer fifo from proxy to GPU
|
||||
struct ncclConnFifo* connFifo; // Used for GPU - Proxy communication
|
||||
|
||||
uint64_t step; // Keep where we are
|
||||
uint64_t llLastCleaning;
|
||||
@@ -151,6 +150,9 @@ struct ncclDirect {
|
||||
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
|
||||
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
|
||||
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
|
||||
// The heads[...] are guaranteed to be in rotated order start with self:
|
||||
// headRank, (headRank+1)%nHeads, (headRank+2)%nHeads, ...
|
||||
int heads[NCCL_MAX_DIRECT_ARITY+1];
|
||||
int up[NCCL_MAX_DIRECT_ARITY];
|
||||
int down[NCCL_MAX_DIRECT_ARITY];
|
||||
};
|
||||
@@ -210,21 +212,28 @@ struct ncclWorkElem {
|
||||
union {
|
||||
uint8_t flagBits;
|
||||
struct {
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1;
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, oneNode:1;
|
||||
};
|
||||
};
|
||||
uint8_t nWarps;
|
||||
uint8_t direct;
|
||||
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
uint32_t root;
|
||||
const void *sendbuff;
|
||||
void *recvbuff;
|
||||
|
||||
size_t count;
|
||||
size_t lastChunkSize;
|
||||
uint32_t root;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint64_t redOpArg;
|
||||
uint64_t chunkCount:25, workCount:39;
|
||||
union {
|
||||
struct {
|
||||
uint64_t lastChunkCount:25;
|
||||
uint64_t workOffset:39;
|
||||
};
|
||||
struct {
|
||||
uint64_t bid:32;
|
||||
uint64_t nChannels:32;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem))
|
||||
@@ -235,7 +244,8 @@ struct ncclWorkElemP2p {
|
||||
int proto : 2;
|
||||
|
||||
enum ncclWorkP2PType p2pType;
|
||||
uint8_t nWarps;
|
||||
uint8_t reg:1;
|
||||
uint8_t nWarps:5;
|
||||
uint8_t warpStart;
|
||||
uint8_t ngroups;
|
||||
// Important not to use any fields with greater than 4-byte alignment since
|
||||
@@ -296,6 +306,8 @@ struct alignas(16) ncclDevChannel {
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
int node;
|
||||
int nNodes;
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
int p2pChunkSize;
|
||||
|
||||
@@ -303,6 +315,8 @@ struct ncclDevComm {
|
||||
int workFifoDepth;
|
||||
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
|
||||
|
||||
int* collNetDenseToUserRank;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t* abortFlag;
|
||||
|
||||
@@ -415,46 +429,54 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
|
||||
#else
|
||||
constexpr int NumTypes = ncclNumTypes + 1;
|
||||
#endif
|
||||
int row;
|
||||
do {
|
||||
row = 0; // ncclDevFuncIndex_P2p
|
||||
if (coll == ncclFuncSendRecv) break;
|
||||
row += 1;
|
||||
|
||||
int row = 0; // ncclDevFuncIndex_P2p
|
||||
if (coll == ncclFuncSendRecv) goto have_row;
|
||||
row += 1;
|
||||
int nAlgos = 3;
|
||||
if (coll == ncclFuncAllGather) {
|
||||
int algo1 = algo == NCCL_ALGO_RING ? 0 :
|
||||
algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
|
||||
/*algo == NCCL_ALGO_NVLS*/ 2;
|
||||
row += algo1*NCCL_NUM_PROTOCOLS + proto;
|
||||
break;
|
||||
}
|
||||
row += nAlgos*NCCL_NUM_PROTOCOLS;
|
||||
|
||||
if (coll == ncclFuncAllGather) {
|
||||
int algo1 = algo == NCCL_ALGO_RING ? 0 :
|
||||
/*algo == NCCL_ALGO_NVLS*/ 1;
|
||||
row += algo1*NCCL_NUM_PROTOCOLS + proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS;
|
||||
nAlgos = 1;
|
||||
if (coll == ncclFuncBroadcast) {
|
||||
row += proto;
|
||||
break;
|
||||
}
|
||||
row += nAlgos*NCCL_NUM_PROTOCOLS;
|
||||
|
||||
if (coll == ncclFuncBroadcast) {
|
||||
row += proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS;
|
||||
nAlgos = NCCL_NUM_ALGORITHMS;
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto;
|
||||
break;
|
||||
}
|
||||
row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS;
|
||||
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS;
|
||||
nAlgos = 1;
|
||||
if (coll == ncclFuncReduce) {
|
||||
row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto;
|
||||
break;
|
||||
}
|
||||
row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS;
|
||||
|
||||
if (coll == ncclFuncReduce) {
|
||||
row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS;
|
||||
nAlgos = 3;
|
||||
if (coll == ncclFuncReduceScatter) {
|
||||
int algo1 = algo == NCCL_ALGO_RING ? 0 :
|
||||
algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
|
||||
/*algo == NCCL_ALGO_NVLS*/ 2;
|
||||
row += ((devRedOp*NumTypes + type)*nAlgos + algo1)*NCCL_NUM_PROTOCOLS + proto;
|
||||
break;
|
||||
}
|
||||
row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS;
|
||||
} while (false);
|
||||
|
||||
if (coll == ncclFuncReduceScatter) {
|
||||
int algo1 = algo == NCCL_ALGO_RING ? 0 :
|
||||
/*algo == NCCL_ALGO_NVLS*/ 1;
|
||||
row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS;
|
||||
|
||||
have_row:
|
||||
return ncclDevFuncRowToId[row];
|
||||
}
|
||||
|
||||
|
||||
@@ -12,8 +12,10 @@
|
||||
#include "collectives.h"
|
||||
#include "utils.h"
|
||||
|
||||
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
|
||||
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
|
||||
#define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t)
|
||||
#define NCCL_LL128_ALIGNMENT_PER_WARP 480
|
||||
#define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
|
||||
#define NCCL_BYTES_ALIGNMENT 16
|
||||
|
||||
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
|
||||
@@ -33,6 +33,7 @@ int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
|
||||
@@ -53,10 +54,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
|
||||
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
|
||||
@@ -102,6 +104,7 @@ struct ncclTopoRanks {
|
||||
int treeToChild0[MAXCHANNELS];
|
||||
int treeToChild1[MAXCHANNELS];
|
||||
int nvlsHeads[MAXCHANNELS];
|
||||
int nvlsHeadNum;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
|
||||
|
||||
+39
-13
@@ -13,6 +13,7 @@
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "strongstream.h"
|
||||
#define NCCL_MAX_LOCAL_RANKS 64
|
||||
|
||||
typedef enum : uint8_t {
|
||||
ncclPatternRing,
|
||||
@@ -30,6 +31,13 @@ typedef enum : uint8_t {
|
||||
ncclPatternRecv
|
||||
} ncclPattern_t;
|
||||
|
||||
// Kind of user-buffer registration attached to a collective; stored in
// ncclInfo::regBufType.
// NOTE(review): the per-value meanings below are read from the names only -
// confirm against the code that sets regBufType.
enum ncclRegBufferType {
|
||||
NCCL_REGULAR_BUFFER = 0,   // presumably: buffer is not registered
|
||||
NCCL_IPC_REG_BUFFER = 1,   // presumably: buffer registered for IPC access
|
||||
NCCL_NVLS_REG_BUFFER = 2,  // presumably: buffer registered for NVLS
|
||||
NCCL_REG_BUFFER_NUM = 3    // equals the number of buffer types listed above
|
||||
};
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
struct ncclInfo {
|
||||
ncclFunc_t coll;
|
||||
@@ -48,37 +56,46 @@ struct ncclInfo {
|
||||
int sliceSteps;
|
||||
// Computed later
|
||||
ncclDevRedOpFull opFull;
|
||||
int algorithm;
|
||||
int protocol;
|
||||
ncclPattern_t pattern;
|
||||
int nChannels;
|
||||
int nThreads;
|
||||
size_t nBytes;
|
||||
size_t aggnBytes;
|
||||
size_t workBytes;
|
||||
size_t sendbuffSize;
|
||||
size_t recvbuffSize;
|
||||
int nstepsPerLoop;
|
||||
int nchunksPerLoop;
|
||||
int stepSize;
|
||||
int chunkCount;
|
||||
int chunkSize;
|
||||
int channelId;
|
||||
int workFuncIndex;
|
||||
ncclRegBufferType regBufType;
|
||||
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
|
||||
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
|
||||
// Need to initialize
|
||||
int nThreads;
|
||||
int nChannels;
|
||||
int algorithm;
|
||||
int protocol;
|
||||
bool userTuned;
|
||||
struct ncclInfo *next;
|
||||
};
|
||||
|
||||
/*
 * Derive the size-related fields of `info` from its count and datatype.
 *
 *   workBytes    - count * ncclTypeSize(datatype), computed before count and
 *                  datatype are rewritten below.
 *   nBytes       - same value, multiplied by nRanks for AllGather and
 *                  ReduceScatter (for those, count is per rank).
 *   count,
 *   datatype     - for AllGather and Broadcast, rewritten to workBytes
 *                  elements of ncclInt8.
 *   sendbuffSize,
 *   recvbuffSize - buffer sizes used for NVLS buffer registration; the
 *                  gathered/scattered side is nRanks times the per-rank side.
 *
 * info   : collective description; info->coll, info->count and
 *          info->datatype must already be set.
 * nRanks : number of ranks used to scale the per-rank sizes.
 *
 * Always returns ncclSuccess.
 */
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
  info->nBytes = info->workBytes = info->count * ncclTypeSize(info->datatype);
  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
    // Re-express the operation as workBytes elements of ncclInt8.
    info->count = info->workBytes;
    info->datatype = ncclInt8;
  }
  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank

  /* compute buffer size for NVLS buffer registration */
  if (info->coll == ncclFuncAllGather) {
    info->sendbuffSize = info->workBytes;
    info->recvbuffSize = info->sendbuffSize * nRanks;
  } else if (info->coll == ncclFuncReduceScatter) {
    info->recvbuffSize = info->workBytes;
    info->sendbuffSize = info->recvbuffSize * nRanks;
  } else {
    info->sendbuffSize = info->recvbuffSize = info->workBytes;
  }
  return ncclSuccess;
}
|
||||
@@ -93,6 +110,7 @@ struct ncclTaskColl {
|
||||
ncclDataType_t datatype;
|
||||
ncclDevRedOpFull op;
|
||||
int chunkSteps, sliceSteps;
|
||||
struct ncclInfo info;
|
||||
};
|
||||
struct ncclTaskP2p {
|
||||
ncclTaskP2p *next;
|
||||
@@ -113,8 +131,16 @@ struct ncclTasks {
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
|
||||
};
|
||||
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
|
||||
size_t collBytesTotal;
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collQueue;
|
||||
// Queue for user-tuned executed collectives
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collTunedQueue;
|
||||
// Queue for continuous bytes distribution (CBD) collectives
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collCBDQueue;
|
||||
// Queue for collnet
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collnetQueue;
|
||||
size_t workBytesTotal;
|
||||
int usableChannels;
|
||||
bool sorted;
|
||||
struct Peer* peers/*[nRanks]*/;
|
||||
int *p2pSendOrder, *p2pRecvOrder;
|
||||
int p2pOrderSteps;
|
||||
|
||||
@@ -35,4 +35,7 @@ ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
|
||||
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
|
||||
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
|
||||
|
||||
ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash);
|
||||
ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd);
|
||||
|
||||
#endif /* NCCL_IPCSOCKET_H */
|
||||
|
||||
@@ -13,7 +13,17 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
|
||||
typedef enum {
|
||||
ncclFuncBroadcast = 0,
|
||||
ncclFuncReduce = 1,
|
||||
ncclFuncAllGather = 2,
|
||||
ncclFuncReduceScatter = 3,
|
||||
ncclFuncAllReduce = 4,
|
||||
ncclFuncSendRecv = 5,
|
||||
ncclFuncSend = 6,
|
||||
ncclFuncRecv = 7,
|
||||
ncclNumFuncs = 8
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
|
||||
+173
-50
@@ -21,6 +21,140 @@
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v8_t;
|
||||
|
||||
typedef ncclNetProperties_v8_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If sizes is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v8_t;
|
||||
|
||||
typedef ncclNet_v8_t ncclNet_t;
|
||||
|
||||
#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8
|
||||
|
||||
// One buffer segment handed to the v8 collective-network plugin: arrays of
// these are passed as recvParts to iallgather and as sendParts to
// ireducescatter in ncclCollNet_v8_t.
typedef struct {
|
||||
void* mhandle;   // presumably the memory handle returned by regMr for this segment - TODO confirm
|
||||
void* address;   // start address of the segment
|
||||
uint32_t size;   // segment length; presumably in bytes - TODO confirm
|
||||
} ncclNetSGE_v8_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
void* sendMhandle, void** request);
|
||||
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp,
|
||||
void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v8_t;
|
||||
|
||||
typedef ncclCollNet_v8_t ncclCollNet_t;
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
@@ -36,8 +170,6 @@ typedef struct {
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v7_t;
|
||||
|
||||
typedef ncclNetProperties_v7_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
@@ -93,11 +225,45 @@ typedef struct {
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v7_t;
|
||||
|
||||
typedef ncclNet_v7_t ncclNet_t;
|
||||
|
||||
#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v7_t;
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V6 8
|
||||
|
||||
@@ -162,49 +328,6 @@ typedef struct {
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v6_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v7_t;
|
||||
|
||||
typedef ncclCollNet_v7_t ncclCollNet_t;
|
||||
|
||||
// v6 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
|
||||
@@ -24,6 +24,7 @@ typedef struct {
|
||||
int needsProxyProgress;
|
||||
} ncclNetDeviceHandle_v7_t;
|
||||
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
|
||||
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -20,6 +20,12 @@
|
||||
// Dynamically handle dependencies on NVML
|
||||
|
||||
/* Extracted from nvml.h */
|
||||
|
||||
#define NVML_API_VERSION 12
|
||||
|
||||
#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \
|
||||
(ver << 24U))
|
||||
|
||||
typedef struct nvmlDevice_st* nvmlDevice_t;
|
||||
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
|
||||
|
||||
@@ -181,6 +187,72 @@ typedef struct nvmlFieldValue_st
|
||||
nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
|
||||
} nvmlFieldValue_t;
|
||||
|
||||
|
||||
#define NVML_GPU_FABRIC_UUID_LEN 16
|
||||
|
||||
#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0
|
||||
#define NVML_GPU_FABRIC_STATE_NOT_STARTED 1
|
||||
#define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2
|
||||
#define NVML_GPU_FABRIC_STATE_COMPLETED 3
|
||||
|
||||
typedef unsigned char nvmlGpuFabricState_t;
|
||||
|
||||
typedef struct {
|
||||
unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
|
||||
nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete".
|
||||
unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs
|
||||
nvmlGpuFabricState_t state; //!< Current state of GPU registration process
|
||||
} nvmlGpuFabricInfo_t;
|
||||
|
||||
#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0
|
||||
#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE 1
|
||||
#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE 2
|
||||
|
||||
#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0
|
||||
#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x11
|
||||
|
||||
/**
|
||||
* GPU Fabric Health Status Mask for various fields can be obtained
|
||||
* using the below macro.
|
||||
* Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW)
|
||||
*/
|
||||
#define NVML_GPU_FABRIC_HEALTH_GET(var, type) \
|
||||
(((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \
|
||||
(NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type))
|
||||
|
||||
/**
|
||||
* GPU Fabric Health Status Mask for various fields can be tested
|
||||
* using the below macro.
|
||||
* Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE)
|
||||
*/
|
||||
#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \
|
||||
(NVML_GPU_FABRIC_HEALTH_GET(var, type) == \
|
||||
NVML_GPU_FABRIC_HEALTH_MASK##type##val)
|
||||
|
||||
/**
|
||||
* GPU Fabric information (v2).
|
||||
*
|
||||
* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field
|
||||
* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask
|
||||
* field to the end. This structure is not backwards-compatible with
|
||||
* \ref nvmlGpuFabricInfo_t.
|
||||
*/
|
||||
typedef struct {
|
||||
unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuFabricInfo_v2)
|
||||
unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
|
||||
nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete".
|
||||
unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs
|
||||
nvmlGpuFabricState_t state; //!< Current state of GPU registration process
|
||||
unsigned int healthMask; //!< GPU Fabric health Status Mask
|
||||
} nvmlGpuFabricInfo_v2_t;
|
||||
|
||||
typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
|
||||
|
||||
/**
|
||||
* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version.
|
||||
*/
|
||||
#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2)
|
||||
|
||||
/* End of nvml.h */
|
||||
#endif // NCCL_NVML_DIRECT
|
||||
|
||||
@@ -210,5 +282,6 @@ ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
|
||||
ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
|
||||
ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
|
||||
ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
|
||||
ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo);
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
+14
-2
@@ -9,10 +9,22 @@
|
||||
#ifndef NCCL_P2P_H_
|
||||
#define NCCL_P2P_H_
|
||||
|
||||
#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
|
||||
#include <cuda.h>
|
||||
|
||||
typedef struct {
|
||||
#if CUDART_VERSION < 12030
|
||||
// MNNVL: FABRIC handle support lifted from CUDA 12.3
|
||||
#define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128)
|
||||
#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
|
||||
#define CU_IPC_HANDLE_SIZE 64
|
||||
typedef struct CUmemFabricHandle_st {
|
||||
unsigned char data[CU_IPC_HANDLE_SIZE];
|
||||
} CUmemFabricHandle_v1;
|
||||
typedef CUmemFabricHandle_v1 CUmemFabricHandle;
|
||||
#endif
|
||||
|
||||
typedef union {
|
||||
uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
|
||||
CUmemFabricHandle handle;
|
||||
} ncclCuDesc;
|
||||
|
||||
typedef union {
|
||||
|
||||
+43
-17
@@ -24,33 +24,42 @@ typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclP
|
||||
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
|
||||
|
||||
// Extra per-operation parameters; embedded as the `specifics` member of both
// ncclProxyOp and ncclProxyArgs. Only one variant (collnetDirect) exists.
union ncclProxyOpSpecifics {
|
||||
struct {
|
||||
size_t sizePerRank;   // presumably bytes contributed by each rank - TODO confirm
|
||||
int nNodes, node;     // presumably total node count and this node's index - TODO confirm
|
||||
} collnetDirect;
|
||||
};
|
||||
|
||||
struct ncclProxyOp {
|
||||
struct ncclProxyConnection* connection;
|
||||
int channelId;
|
||||
int nsteps;
|
||||
void* buffer;
|
||||
ssize_t nbytes;
|
||||
uint64_t opCount;
|
||||
int root;
|
||||
int next;
|
||||
|
||||
uint64_t opCount;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
int chunkSize;
|
||||
uint8_t sliceSteps;
|
||||
uint8_t chunkSteps;
|
||||
uint8_t channelId;
|
||||
uint8_t /*ncclDataType_t*/ dtype;
|
||||
uint8_t /*ncclDevRedOp_t*/ redOp;
|
||||
uint8_t /*ncclFunc_t*/ coll;
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
uint8_t protocol;
|
||||
uint8_t reg;
|
||||
|
||||
union {
|
||||
uint64_t unused;
|
||||
// For use by enqueue.cc
|
||||
struct ncclProxyOp *enqNext;
|
||||
};
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
|
||||
struct ncclProxyOp *enqNext;
|
||||
};
|
||||
static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
|
||||
|
||||
struct ncclProxySubArgs {
|
||||
struct ncclProxyConnection* connection;
|
||||
int reg;
|
||||
void* buffer;
|
||||
void* mhandle;
|
||||
int channelId;
|
||||
int nsteps;
|
||||
ssize_t nbytes;
|
||||
@@ -82,6 +91,7 @@ struct ncclProxyArgs {
|
||||
uint8_t /*ncclDataType_t*/ dtype;
|
||||
uint8_t /*ncclDevRedOp_t*/ redOp;
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
uint8_t /*ncclFunc_t*/ coll;
|
||||
uint8_t protocol;
|
||||
int state;
|
||||
char* sharedBuff[NCCL_STEPS];
|
||||
@@ -93,6 +103,8 @@ struct ncclProxyArgs {
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
struct ncclProxyArgs** proxyAppendPtr;
|
||||
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
};
|
||||
#define NCCL_MAX_NETDEVS 128
|
||||
|
||||
@@ -100,7 +112,7 @@ struct ncclProxyArgs {
|
||||
// Make sure we have enough to store two full rounds of operations on all channels.
|
||||
// Otherwise we'd be unable to post half of them to free new elements.
|
||||
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
|
||||
#define NCCL_MAX_LOCAL_RANKS 64
|
||||
|
||||
struct ncclProxyOpsPool {
|
||||
struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
|
||||
volatile int nextOps;
|
||||
@@ -193,6 +205,16 @@ struct ncclProxyRpcResponseHeader {
|
||||
int respSize;
|
||||
};
|
||||
|
||||
// UDS support
// Fixed-size message header.
// NOTE(review): presumably the `hdr` payload exchanged through
// ncclIpcSocketSendMsg/ncclIpcSocketRecvMsg - confirm against the callers.
|
||||
struct ncclIpcHdr {
|
||||
int type;       // message type; the set of valid values is not visible here
|
||||
int rank;
|
||||
int reqSize;    // presumably size of the request payload - TODO confirm
|
||||
int respSize;   // presumably size of the expected response - TODO confirm
|
||||
void *opId;
|
||||
uint64_t data[16]; // 128-bytes
|
||||
};
|
||||
|
||||
struct ncclProxyState {
|
||||
int refCount;
|
||||
int tpRank;
|
||||
@@ -208,9 +230,11 @@ struct ncclProxyState {
|
||||
ncclNet_t* ncclNet;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
volatile uint32_t* abortFlag;
|
||||
// Service thread
|
||||
// Service threads
|
||||
pthread_t thread;
|
||||
pthread_t threadUDS;
|
||||
struct ncclSocket* listenSock;
|
||||
struct ncclIpcSocket ipcSock;
|
||||
int stop;
|
||||
CUcontext cudaCtx;
|
||||
ncclResult_t asyncResult;
|
||||
@@ -221,6 +245,7 @@ struct ncclProxyState {
|
||||
struct ncclProxyOps* proxyOps;
|
||||
void** sharedDevMems;
|
||||
struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
|
||||
uint64_t *peerAddressesUDS; // cuMem API support (UDS)
|
||||
|
||||
// Progress thread
|
||||
struct ncclProxyProgressState progressState;
|
||||
@@ -262,9 +287,9 @@ enum proxyMode {
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
|
||||
enum ncclProxyMsgType {
|
||||
@@ -288,7 +313,8 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
|
||||
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
|
||||
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
|
||||
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd);
|
||||
// UDS support
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
|
||||
|
||||
ncclResult_t ncclProxyStop(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
#ifndef NCCL_REGISTER_H_
|
||||
#define NCCL_REGISTER_H_
|
||||
|
||||
// Registration status flags. Each value occupies a distinct bit, so several
// can be combined in one word.
// NOTE(review): presumably OR-ed into ncclReg::state - confirm against users.
enum {
|
||||
NET_REG_COMPLETE = 0x01,
|
||||
NVLS_REG_COMPLETE = 0x02,
|
||||
NVLS_REG_POSSIBLE = 0x04,
|
||||
NVLS_REG_NO_SUPPORT = 0x08
|
||||
};
|
||||
|
||||
struct ncclReg {
|
||||
// common attributes
|
||||
size_t pages;
|
||||
int refs;
|
||||
uintptr_t addr;
|
||||
uint32_t state;
|
||||
// net reg
|
||||
int nDevs;
|
||||
int devs[MAXCHANNELS];
|
||||
void** handles;
|
||||
// nvls reg
|
||||
uintptr_t baseAddr;
|
||||
size_t baseSize;
|
||||
CUdeviceptr regAddr;
|
||||
size_t regSize;
|
||||
int dev;
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
|
||||
};
|
||||
|
||||
// Container of ncclReg registration records plus per-channel comm pointers.
// NOTE(review): field meanings below are inferred from names and types only;
// confirm against the code that fills in this structure.
struct ncclRegCache {
|
||||
struct ncclReg **slots;      // array of pointers to registration records
|
||||
int capacity, population;    // presumably allocated vs. used entries in slots - TODO confirm
|
||||
uintptr_t pageSize;          // presumably the granularity for ncclReg::pages - TODO confirm
|
||||
void* sComms[MAXCHANNELS];   // presumably send comms, one per channel - TODO confirm
|
||||
void* rComms[MAXCHANNELS];   // presumably recv comms, one per channel - TODO confirm
|
||||
};
|
||||
|
||||
ncclResult_t ncclRegCleanup(struct ncclComm* comm);
|
||||
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
|
||||
|
||||
#endif
|
||||
@@ -18,6 +18,7 @@ struct ncclShmemCollBuff {
|
||||
volatile size_t *cnt[2];
|
||||
volatile void *ptr[2];
|
||||
int round;
|
||||
size_t maxTypeSize;
|
||||
};
|
||||
|
||||
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
|
||||
|
||||
@@ -43,6 +43,8 @@ struct ncclPeerInfo {
|
||||
int64_t busId;
|
||||
struct ncclComm* comm;
|
||||
int cudaCompCap;
|
||||
// MNNVL support
|
||||
nvmlGpuFabricInfoV_t fabricInfo;
|
||||
};
|
||||
|
||||
#define CONNECT_SIZE 128
|
||||
|
||||
@@ -30,6 +30,11 @@ uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
ncclResult_t getRandomData(void* buffer, size_t bytes);
|
||||
|
||||
const char* ncclOpToString(ncclRedOp_t op);
|
||||
const char* ncclDatatypeToString(ncclDataType_t type);
|
||||
const char* ncclAlgoToString(int algo);
|
||||
const char* ncclProtoToString(int proto);
|
||||
|
||||
struct netIf {
|
||||
char prefix[64];
|
||||
int port;
|
||||
@@ -394,6 +399,36 @@ void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
|
||||
}
|
||||
}
|
||||
|
||||
/* cmp function determines the sequence of objects in the queue. If cmp returns value >= 0, it means a > b,
|
||||
* and we should put a before b; otherwise, b should be put ahead of a. */
|
||||
template<typename T, T *T::*next>
|
||||
inline void ncclIntruQueueSortEnqueue(ncclIntruQueue<T,next> *me, T *x, int (*cmp)(T *a, T *b)) {
|
||||
T *cur = me->head;
|
||||
T *prev = NULL;
|
||||
|
||||
if (cur == NULL) {
|
||||
x->*next = nullptr;
|
||||
me->tail = me->head = x;
|
||||
} else {
|
||||
while (cur) {
|
||||
if (cmp(cur, x) > 0) {
|
||||
prev = cur;
|
||||
cur = cur->next;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
x->*next = cur;
|
||||
if (prev) {
|
||||
prev->*next = x;
|
||||
if (cur == NULL) me->tail = x;
|
||||
} else {
|
||||
me->head = x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() {
|
||||
|
||||
+139
-121
@@ -180,6 +180,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
* resource cleanup in commFree(). */
|
||||
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
|
||||
pthread_join(comm->proxyState->thread, nullptr);
|
||||
if (comm->proxyState->threadUDS) {
|
||||
// UDS support
|
||||
pthread_join(comm->proxyState->threadUDS, nullptr);;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] comm->userRedOps;
|
||||
@@ -238,17 +242,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->topParentRanks);
|
||||
free(comm->topParentLocalRanks);
|
||||
|
||||
while (!ncclIntruQueueEmpty(&comm->regRecordQueue)) {
|
||||
struct ncclRegRecord* rec = ncclIntruQueueDequeue(&comm->regRecordQueue);
|
||||
NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
|
||||
free(rec->addrs);
|
||||
free(rec);
|
||||
}
|
||||
|
||||
while (!ncclIntruQueueEmpty(&comm->regRequestQueue)) {
|
||||
struct ncclRegRequest* req = ncclIntruQueueDequeue(&comm->regRequestQueue);
|
||||
free(req);
|
||||
}
|
||||
NCCLCHECK(ncclRegCleanup(comm));
|
||||
|
||||
commPoison(comm); // poison comm before free to avoid comm reuse.
|
||||
free(comm);
|
||||
@@ -256,7 +250,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(AggChannelSize, "AGG_CHANNEL_SIZE", -2);
|
||||
NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0);
|
||||
// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory
|
||||
NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
|
||||
@@ -288,7 +281,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
|
||||
/* comm must be ready, or error will be reported */
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
if (*comm->abortFlag) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
|
||||
ncclGroupJobAbort(comm->groupJob);
|
||||
} else {
|
||||
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
|
||||
@@ -361,7 +354,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm->channelSize = ncclParamAggChannelSize();
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||
@@ -393,9 +385,9 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
comm->topParentRanks[i] = i;
|
||||
}
|
||||
|
||||
ncclIntruQueueConstruct(&comm->regRequestQueue);
|
||||
ncclIntruQueueConstruct(&comm->regRecordQueue);
|
||||
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
|
||||
|
||||
comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -411,6 +403,8 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
comm->devComm = &devCommAndChans->comm;
|
||||
tmpCommAndChans.comm.rank = comm->rank;
|
||||
tmpCommAndChans.comm.nRanks = nRanks;
|
||||
tmpCommAndChans.comm.node = comm->node;
|
||||
tmpCommAndChans.comm.nNodes = comm->nNodes;
|
||||
tmpCommAndChans.comm.abortFlag = comm->abortFlag;
|
||||
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
|
||||
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
|
||||
@@ -443,6 +437,12 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
comm->workFifoSent = 0;
|
||||
comm->workFifoAckdMin = 0;
|
||||
|
||||
if (comm->collNetDenseToUserRank != nullptr) {
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
}
|
||||
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers;
|
||||
tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
|
||||
@@ -499,6 +499,24 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport));
|
||||
info->comm = comm;
|
||||
info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap;
|
||||
|
||||
// MNNVL support
|
||||
{
|
||||
// MNNVL: Request the fabric UUID and partition info
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
nvmlDevice_t nvmlDev;
|
||||
NCCLCHECK(int64ToBusId(info->busId, busId));
|
||||
NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
|
||||
info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
|
||||
(void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo);
|
||||
if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) {
|
||||
INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
|
||||
info->busId,
|
||||
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
|
||||
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
|
||||
}
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -542,8 +560,9 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
}
|
||||
|
||||
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
|
||||
else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
|
||||
// MNNVL support
|
||||
if (!comm->MNNVL && comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
|
||||
else if (comm->MNNVL || ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
|
||||
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
|
||||
|
||||
// Make sure P2P chunksize is not larger than coll chunksize.
|
||||
@@ -573,6 +592,8 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
|
||||
// Find all head ranks
|
||||
int nHeads = collNetGraph->nChannels;
|
||||
int nHeadsUnique = 0;
|
||||
int headsUnique[NCCL_MAX_LOCAL_RANKS];
|
||||
int highestTransportType0, highestTransportType1;
|
||||
char line[1024];
|
||||
bool share;
|
||||
@@ -584,13 +605,20 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
struct collnetShareInfo* infos = NULL;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
|
||||
// Head GPU index is always 0
|
||||
for (int c = 0; c < nHeads; c++) {
|
||||
heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
|
||||
{ uint64_t mask = 0;
|
||||
// Head GPU index is always 0
|
||||
for (int c = 0; c < nHeads; c++) {
|
||||
heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
|
||||
assert(comm->rankToNode[heads[c]] == comm->node);
|
||||
uint64_t mask0 = mask;
|
||||
mask |= 1ull<<comm->rankToLocalRank[heads[c]];
|
||||
if (mask != mask0) headsUnique[nHeadsUnique++] = heads[c];
|
||||
}
|
||||
}
|
||||
|
||||
comm->collNetHeads = heads;
|
||||
comm->collNetHeadsNum = nHeads;
|
||||
comm->collNetHeadsUniqueNum = nHeadsUnique;
|
||||
if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) {
|
||||
NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail);
|
||||
/* check whether child can share collnet resources of parent. Since parent builds each collnet communicator
|
||||
@@ -651,6 +679,26 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
|
||||
comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
|
||||
comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
|
||||
|
||||
comm->collNetDenseToUserRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
|
||||
comm->collNetUserToDenseRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
|
||||
{ // initialize collNetUserToDenseRank[rank]
|
||||
uint64_t nonHeadMask = (1ull<<comm->localRanks)-1;
|
||||
comm->collNetUserToDenseRank[rank] = -1;
|
||||
for (int h=0; h < nHeadsUnique; h++) {
|
||||
nonHeadMask ^= 1ull<<comm->rankToLocalRank[headsUnique[h]];
|
||||
if (headsUnique[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; }
|
||||
}
|
||||
if (comm->collNetUserToDenseRank[rank] == -1) {
|
||||
comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull<<comm->localRank)-1));
|
||||
}
|
||||
comm->collNetUserToDenseRank[rank] += comm->node*comm->localRanks;
|
||||
}
|
||||
NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int)));
|
||||
for (int r=0; r < comm->nRanks; r++) {
|
||||
comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r;
|
||||
}
|
||||
|
||||
for (int c = 0; c < comm->collNetChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels + c;
|
||||
NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail);
|
||||
@@ -768,6 +816,9 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
|
||||
NCCL_PARAM(MNNVL, "MNNVL", -2);
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {
|
||||
// We use 2 AllGathers
|
||||
// 1. { peerInfo, comm, compCap}
|
||||
@@ -822,6 +873,56 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
// AllGather1 - end
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
|
||||
// MNNVL support
|
||||
{
|
||||
int cliqueSize = 0;
|
||||
comm->MNNVL = 0;
|
||||
// Determine the size of the MNNVL domain/clique
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[rank].fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
|
||||
// Check that the Fabric state is fully initialized
|
||||
if (fabricInfo2->state != NVML_GPU_FABRIC_STATE_COMPLETED) continue;
|
||||
// Check that the cluster UUID and cliqueId match in each rank
|
||||
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) continue;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
cliqueSize++;
|
||||
}
|
||||
}
|
||||
// Determine whether this is a MNNVL system
|
||||
comm->MNNVL = ncclParamMNNVL() < 0 ? cliqueSize == comm->nRanks : ncclParamMNNVL();
|
||||
// MNNVL requires cuMem to be enabled
|
||||
if (!ncclCuMemEnable()) comm->MNNVL = 0;
|
||||
if (comm->MNNVL) {
|
||||
// MNNVL also requires FABRIC handle support
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUdevice currentDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
|
||||
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
|
||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
|
||||
if (!flag)
|
||||
comm->MNNVL = 0;
|
||||
else
|
||||
// Force the handle type to be FABRIC for MNNVL
|
||||
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
if (ncclParamMNNVL() == 1 && !comm->MNNVL) {
|
||||
WARN("MNNVL is not supported on this system");
|
||||
ret = ncclSystemError;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
do {
|
||||
// Compute intra-process ranks
|
||||
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
|
||||
@@ -1019,6 +1120,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
goto fail;
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
|
||||
comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
|
||||
|
||||
nChannelsOrig = comm->nChannels;
|
||||
NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
|
||||
for (int i=0; i<nranks; i++) {
|
||||
@@ -1099,7 +1203,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
comm->topParentLocalRanks = topParentLocalRanks;
|
||||
|
||||
// Launch proxy service thread, after this, the proxy calls can be used.
|
||||
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
|
||||
if (parent && parent->config.splitShare) {
|
||||
comm->proxyState = parent->sharedRes->proxyState;
|
||||
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
|
||||
}
|
||||
|
||||
// Connect with prev/next for each ring
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
@@ -1124,8 +1233,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Setup NVLS
|
||||
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
|
||||
// And NVLS trees if needed
|
||||
if (comm->nvlsSupport && comm->localRanks > 1) {
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
if (comm->nvlsSupport && comm->nNodes > 1) {
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail);
|
||||
@@ -1142,7 +1251,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Compute time models for algorithm and protocol combinations
|
||||
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
|
||||
|
||||
INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->collNetChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
|
||||
do { // Setup p2p structures in comm->tasks
|
||||
struct ncclTasks* tasks = &comm->tasks;
|
||||
@@ -1376,14 +1485,15 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
|
||||
snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
|
||||
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
|
||||
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
|
||||
NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
|
||||
} else {
|
||||
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
|
||||
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
|
||||
NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
|
||||
}
|
||||
|
||||
comm->cudaArch = cudaArch;
|
||||
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
|
||||
|
||||
@@ -1886,7 +1996,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) {
|
||||
|
||||
NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail);
|
||||
TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state);
|
||||
if (state == ncclSuccess && *comm->abortFlag == 0 && comm->finalizeCalled == false) {
|
||||
if (state == ncclSuccess && __atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0 && comm->finalizeCalled == false) {
|
||||
/* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy
|
||||
* should be nonblocking until last call of ncclCommDestroy. */
|
||||
NCCLCHECKGOTO(commFinalize(comm, false), ret, fail);
|
||||
@@ -2011,9 +2121,9 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
// Ask anything that might still be running on the device to quit
|
||||
childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE);
|
||||
if (childAbortFlag != NULL) {
|
||||
*childAbortFlag = 1;
|
||||
__atomic_store_n(childAbortFlag, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
*comm->abortFlag = 1;
|
||||
__atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELAXED);
|
||||
/* init thread must be joined before we destroy the comm,
|
||||
* and we should ignore the init error here. */
|
||||
ncclCommEnsureReady(comm);
|
||||
@@ -2161,98 +2271,6 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
/* Queue a user buffer for later registration.
 *
 * Nothing is done (ncclSuccess is returned and *handle is left untouched)
 * when CUDART_VERSION < 12010, when the LOCAL_REGISTER parameter is 0, or
 * when the communicator has no NVLS support. Otherwise a ncclRegRequest is
 * allocated, appended to comm->regRequestQueue and returned through *handle.
 *
 * comm:   communicator the buffer is registered with.
 * buff:   start of the user buffer.
 * size:   size of the user buffer in bytes; must be non-zero.
 * handle: receives the opaque request pointer.
 *
 * Returns ncclInvalidArgument for null/zero arguments or when neither the
 * user range nor its enclosing allocation satisfies the alignment checks. */
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t ret = ncclSuccess;

#if CUDART_VERSION >= 12010
  if (!ncclParamLocalRegister()) return ret;

  if (comm == NCCL_COMM_NULL || buff == NULL || handle == NULL || size == 0) {
    WARN("Invalid arguments comm %p, buff %p, size %ld, handle %p", comm, buff, size, handle);
    return ncclInvalidArgument;
  }
  if (!comm->nvlsSupport) return ret;

  size_t granularity;
  CUmulticastObjectProp prop = comm->nvlsResources->properties;
  prop.size = size;
  CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));

  /* Range that will be recorded in the request: the user's range when it is
   * already suitably aligned, otherwise the whole enclosing allocation. */
  uintptr_t regBuff = (uintptr_t)buff;
  size_t regSize = size;

  if (regBuff % comm->nvlsResources->ucGran != 0 || regSize % granularity != 0) {
    /* The size actually allocated by ncclMemAlloc is not exposed to the user,
     * so query the full address range containing buff and try that instead. */
    void* base;
    size_t baseSize;
    CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&base, &baseSize, (CUdeviceptr)buff));
    if ((uintptr_t)base % comm->nvlsResources->ucGran != 0 || baseSize % granularity != 0) {
      /* NOTE(review): the message reports the alignment of buff/size, not of base/baseSize that was just tested. */
      WARN("register fails, buffer %p (aligned %s, granularity %ld) and size %ld (aligned %s, granularity %ld) for registration", buff, (uintptr_t)buff % comm->nvlsResources->ucGran == 0 ? "TRUE" : "FALSE", comm->nvlsResources->ucGran, size, size % granularity == 0 ? "TRUE" : "FALSE", granularity);
      return ncclInvalidArgument;
    }
    regBuff = (uintptr_t)base;
    regSize = baseSize;
  }

  struct ncclRegRequest* req;
  NCCLCHECK(ncclCalloc(&req, 1));
  req->buff = regBuff;
  req->size = regSize;
  ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
  *handle = (void*)req;
#endif

  return ret;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
/* Undo a registration made by ncclCommRegister().
 *
 * Nothing is done (ncclSuccess is returned) when CUDART_VERSION < 12010 or
 * when the LOCAL_REGISTER parameter is 0. Otherwise the register record whose
 * buff/size match the request is deregistered and freed, then the request
 * itself is unlinked from comm->regRequestQueue and freed.
 *
 * comm:   communicator the buffer was registered with.
 * handle: value previously returned through ncclCommRegister()'s handle.
 *
 * Returns ncclInvalidArgument when comm or handle is null, or when handle is
 * not found in comm->regRequestQueue. */
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
  ncclResult_t ret = ncclSuccess;

#if CUDART_VERSION >= 12010
  struct ncclRegRequest* dreq = (struct ncclRegRequest*)handle;
  if (ncclParamLocalRegister()) {
    if (comm == NCCL_COMM_NULL || handle == NULL) {
      WARN("Invalid arguments comm %p, handle %p", comm, handle);
      ret = ncclInvalidArgument;
    } else {
      struct ncclRegRecord* rec;

      /* First release the register record matching this request.
       * NOTE(review): dreq is dereferenced here before the handle has been
       * checked against regRequestQueue, so a bogus handle is read as a request. */
      rec = ncclIntruQueueHead(&comm->regRecordQueue);

      while (rec) {
        if (rec->buff == dreq->buff && rec->size == dreq->size) {
          NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
          ncclIntruQueueDelete(&comm->regRecordQueue, rec);
          free(rec->addrs);
          free(rec);
          break;
        }
        rec = rec->next;
      }

      /* Then unlink and free the request. Once it has left the queue,
       * commFree() can no longer reach it, so it must be released here. */
      if (ncclIntruQueueDelete(&comm->regRequestQueue, dreq) == false) {
        WARN("Invalid handle %p", handle);
        ret = ncclInvalidArgument;
      } else {
        free(dreq);
      }
    }
  }
#endif

  return ret;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
|
||||
ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include "argcheck.h"
|
||||
#include "comm.h"
|
||||
|
||||
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
|
||||
ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
|
||||
cudaPointerAttributes attr;
|
||||
cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
|
||||
if (err != cudaSuccess || attr.devicePointer == NULL) {
|
||||
|
||||
@@ -14,6 +14,9 @@
|
||||
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
|
||||
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
|
||||
|
||||
// Handle type used for cuMemCreate()
|
||||
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
|
||||
static int ncclCuMemSupported = 0;
|
||||
|
||||
// Determine whether CUMEM & VMM RDMA is supported on this platform
|
||||
|
||||
@@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
WARN("UDS: Receiving data over socket failed : %d", errno);
|
||||
return ncclSystemError;
|
||||
}
|
||||
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
}
|
||||
|
||||
if (recvFd != NULL) {
|
||||
@@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
|
||||
return ncclSystemError;
|
||||
}
|
||||
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -39,6 +39,8 @@ namespace {
|
||||
NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
|
||||
NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
|
||||
NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))
|
||||
// MNNVL support
|
||||
NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo))
|
||||
|
||||
std::mutex lock; // NVML has had some thread safety bugs
|
||||
bool initialized = false;
|
||||
@@ -82,7 +84,9 @@ ncclResult_t ncclNvmlEnsureInitialized() {
|
||||
{(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
|
||||
{(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
|
||||
{(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
|
||||
{(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}
|
||||
{(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"},
|
||||
// MNNVL support
|
||||
{(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"},
|
||||
};
|
||||
for(Symbol sym: symbols) {
|
||||
*sym.ppfn = dlsym(libhandle, sym.name);
|
||||
@@ -269,3 +273,12 @@ ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount,
|
||||
NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// MNNVL support
|
||||
/* Wrapper around the dynamically loaded nvmlDeviceGetGpuFabricInfoV symbol.
 *
 * device:        NVML device to query.
 * gpuFabricInfo: caller-provided struct; its `version` field is overwritten
 *                with nvmlGpuFabricInfo_v2 before the NVML call is made.
 *
 * Returns ncclSuccess when the call goes through; failures are reported by
 * NCCLCHECK / NVMLTRY (presumably as an early error return -- their
 * definitions are not part of this excerpt). */
ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo) {
  NCCLCHECK(ncclNvmlEnsureInitialized());

  // Hold the file-wide NVML mutex for the rest of the function.
  std::lock_guard<std::mutex> nvmlGuard(lock);

  gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
  NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo);

  return ncclSuccess;
}
|
||||
|
||||
@@ -169,7 +169,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
|
||||
int curRound = shmem->round;
|
||||
size_t mycnt;
|
||||
|
||||
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL) {
|
||||
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) {
|
||||
ret = ncclInvalidArgument;
|
||||
goto exit;
|
||||
}
|
||||
@@ -184,7 +184,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
|
||||
uint64_t t0 = clockNano();
|
||||
while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
|
||||
if (clockNano() - t0 >= 5 * 1000) sched_yield();
|
||||
if (*comm->abortFlag == 1) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) {
|
||||
ret = ncclInternalError;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
|
||||
}
|
||||
}
|
||||
(*offset) += bytes;
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) {
|
||||
if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) {
|
||||
INFO(NCCL_NET, "socketProgressOpt: abort called");
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -529,6 +529,8 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
|
||||
sock->state = ncclSocketStateConnecting;
|
||||
} else if (ret != EINPROGRESS) {
|
||||
sock->state = ncclSocketStateError;
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -618,12 +620,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
|
||||
do {
|
||||
NCCLCHECK(socketProgressState(sock));
|
||||
} while (sock->asyncFlag == 0 &&
|
||||
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) &&
|
||||
(sock->state == ncclSocketStateConnecting ||
|
||||
sock->state == ncclSocketStateConnectPolling ||
|
||||
sock->state == ncclSocketStateConnected));
|
||||
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
||||
if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
|
||||
switch (sock->state) {
|
||||
case ncclSocketStateConnecting:
|
||||
@@ -665,11 +667,11 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listen
|
||||
do {
|
||||
NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
|
||||
} while (sock->asyncFlag == 0 &&
|
||||
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) &&
|
||||
(sock->state == ncclSocketStateAccepting ||
|
||||
sock->state == ncclSocketStateAccepted));
|
||||
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
||||
if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
|
||||
switch (sock->state) {
|
||||
case ncclSocketStateAccepting:
|
||||
|
||||
+18
-18
@@ -30,25 +30,25 @@ ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
|
||||
if (name) {
|
||||
INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
|
||||
tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
|
||||
}
|
||||
if (tunerPluginLib == nullptr) {
|
||||
// dlopen does not guarantee to set errno, but dlerror only gives us a
|
||||
// string, so checking errno doesn't hurt to try to provide a better
|
||||
// error message
|
||||
if (errno == ENOENT) {
|
||||
INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
|
||||
if (tunerPluginLib == nullptr) {
|
||||
// dlopen does not guarantee to set errno, but dlerror only gives us a
|
||||
// string, so checking errno doesn't hurt to try to provide a better
|
||||
// error message
|
||||
if (errno == ENOENT) {
|
||||
INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
|
||||
}
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
|
||||
}
|
||||
} else {
|
||||
tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
|
||||
if (tunerSymbol == nullptr) {
|
||||
INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
|
||||
dlclose(tunerPluginLib);
|
||||
tunerPluginLib = nullptr;
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
|
||||
tunerPluginRefCount = 0;
|
||||
tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
|
||||
if (tunerSymbol == nullptr) {
|
||||
INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
|
||||
dlclose(tunerPluginLib);
|
||||
tunerPluginLib = nullptr;
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
|
||||
tunerPluginRefCount = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -291,3 +291,79 @@ void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
|
||||
h = h1;
|
||||
}
|
||||
}
|
||||
|
||||
// Translate a reduction operator into the name of its enumerator.
// Any value without a dedicated case is reported as "Unknown".
const char* ncclOpToString(ncclRedOp_t op) {
  const char* label = "Unknown";
  switch (op) {
    case ncclSum:  label = "ncclSum";  break;
    case ncclProd: label = "ncclProd"; break;
    case ncclMax:  label = "ncclMax";  break;
    case ncclMin:  label = "ncclMin";  break;
    case ncclAvg:  label = "ncclAvg";  break;
    default: break;
  }
  return label;
}
|
||||
|
||||
// Translate an NCCL datatype into the name of its enumerator.
// Aliased enumerators (ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble)
// share a value with the canonical name returned here. Values without a
// dedicated case are reported as "Unknown".
const char* ncclDatatypeToString(ncclDataType_t type) {
  switch (type) {
    case ncclInt8:    return "ncclInt8";     // ncclChar
    // ncclUint8 previously had no case and was reported as "Unknown".
    case ncclUint8:   return "ncclUint8";
    case ncclInt32:   return "ncclInt32";    // ncclInt
    case ncclUint32:  return "ncclUint32";
    case ncclInt64:   return "ncclInt64";
    case ncclUint64:  return "ncclUint64";
    case ncclFloat16: return "ncclFloat16";  // ncclHalf
    case ncclFloat32: return "ncclFloat32";  // ncclFloat
    case ncclFloat64: return "ncclFloat64";  // ncclDouble
#if defined(__CUDA_BF16_TYPES_EXIST__)
    case ncclBfloat16: return "ncclBfloat16";
#endif
    default:          return "Unknown";
  }
}
|
||||
|
||||
const char* ncclAlgoToString(int algo) {
|
||||
switch (algo) {
|
||||
case NCCL_ALGO_TREE:
|
||||
return "TREE";
|
||||
case NCCL_ALGO_RING:
|
||||
return "RING";
|
||||
case NCCL_ALGO_COLLNET_DIRECT:
|
||||
return "COLLNET_DIRECT";
|
||||
case NCCL_ALGO_COLLNET_CHAIN:
|
||||
return "COLLNET_CHAIN";
|
||||
case NCCL_ALGO_NVLS:
|
||||
return "NVLS";
|
||||
case NCCL_ALGO_NVLS_TREE:
|
||||
return "NVLS_TREE";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
const char* ncclProtoToString(int proto) {
|
||||
switch (proto) {
|
||||
case NCCL_PROTO_LL:
|
||||
return "LL";
|
||||
case NCCL_PROTO_LL128:
|
||||
return "LL128";
|
||||
case NCCL_PROTO_SIMPLE:
|
||||
return "SIMPLE";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
+10
-3
@@ -154,9 +154,7 @@ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *new
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
|
||||
/* Returns a human-readable message of the last error that occurred.
|
||||
* comm is currently unused and can be set to NULL
|
||||
*/
|
||||
/* Returns a human-readable message of the last error that occurred. */
|
||||
const char* ncclGetLastError(ncclComm_t comm);
|
||||
const char* pncclGetLastError(ncclComm_t comm);
|
||||
|
||||
@@ -176,6 +174,15 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
|
||||
|
||||
/* Register CUDA buffer for zero-copy operation */
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
|
||||
/* Deregister CUDA buffer */
|
||||
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
|
||||
/* Reduction operation selector */
|
||||
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
|
||||
typedef enum { ncclSum = 0,
|
||||
|
||||
+267
-123
@@ -15,16 +15,67 @@
|
||||
//#include <sys/stat.h>
|
||||
//#include <unistd.h>
|
||||
|
||||
static ncclNet_v7_t ncclNet_v5_as_v7;
|
||||
static ncclNet_v7_t ncclNet_v6_as_v7;
|
||||
static ncclNet_v8_t ncclNet_v5_as_v8;
|
||||
static ncclNet_v8_t ncclNet_v6_as_v8;
|
||||
static ncclNet_v8_t ncclNet_v7_as_v8;
|
||||
static ncclNet_v5_t *ncclNet_v5;
|
||||
static ncclNet_v6_t *ncclNet_v6;
|
||||
static ncclCollNet_v7_t ncclCollNet_v5_as_v7;
|
||||
static ncclCollNet_v7_t ncclCollNet_v6_as_v7;
|
||||
static ncclNet_v7_t *ncclNet_v7;
|
||||
static ncclCollNet_v8_t ncclCollNet_v5_as_v8;
|
||||
static ncclCollNet_v8_t ncclCollNet_v6_as_v8;
|
||||
static ncclCollNet_v8_t ncclCollNet_v7_as_v8;
|
||||
static ncclCollNet_v5_t *ncclCollNet_v5;
|
||||
static ncclCollNet_v6_t *ncclCollNet_v6;
|
||||
static ncclCollNet_v7_t *ncclCollNet_v7;
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
// v7 -> v8 adapter for getProperties: queries the v7 plugin and copies the
// result field by field into the v8 structure. On failure the plugin's error
// code is returned and *props is left untouched.
static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v7_t v7Props;
  const ncclResult_t status = ncclNet_v7->getProperties(dev, &v7Props);
  if (status != ncclSuccess) {
    return status;
  }

  props->name = v7Props.name;
  props->pciPath = v7Props.pciPath;
  props->guid = v7Props.guid;
  props->ptrSupport = v7Props.ptrSupport;
  // Not taken from the v7 result: always reported as 0 by this adapter.
  props->regIsGlobal = 0;
  props->speed = v7Props.speed;
  props->port = v7Props.port;
  props->maxComms = v7Props.maxComms;
  props->maxRecvs = v7Props.maxRecvs;
  props->latency = v7Props.latency;
  props->netDeviceType = v7Props.netDeviceType;
  props->netDeviceVersion = v7Props.netDeviceVersion;

  return ncclSuccess;
}
|
||||
|
||||
// v7 -> v8 adapter for regMr. The v8 entry point receives the size as a
// size_t, but it is narrowed to int for the v7 call, so any size that does
// not fit in 31 bits is rejected with ncclInternalError instead of being
// silently truncated.
static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound must be computed in size_t: a plain `1<<31` overflows int and,
  // once converted for the comparison, becomes a huge value that would let
  // sizes >= 2 GiB slip through.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// Initialise the underlying v7 plugin, then populate the v8 shim table.
// Entry points are forwarded directly from the v7 table, except
// getProperties and regMr, which go through the adapters in this file.
// If the plugin's init fails, NCCLCHECK returns before the table is filled.
static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v7->init(logfn));

  ncclNet_v8_t* shim = &ncclNet_v7_as_v8;

  shim->name = ncclNet_v7->name;
  shim->devices = ncclNet_v7->devices;
  shim->getProperties = ncclNet_v7_as_v8_getProperties;

  // Connection management
  shim->listen = ncclNet_v7->listen;
  shim->connect = ncclNet_v7->connect;
  shim->accept = ncclNet_v7->accept;

  // Memory registration
  shim->regMr = ncclNet_v7_as_v8_regMr;
  shim->regMrDmaBuf = ncclNet_v7->regMrDmaBuf;
  shim->deregMr = ncclNet_v7->deregMr;

  // Data path
  shim->isend = ncclNet_v7->isend;
  shim->irecv = ncclNet_v7->irecv;
  shim->iflush = ncclNet_v7->iflush;
  shim->test = ncclNet_v7->test;

  // Teardown
  shim->closeSend = ncclNet_v7->closeSend;
  shim->closeRecv = ncclNet_v7->closeRecv;
  shim->closeListen = ncclNet_v7->closeListen;

  // Device-side entry points
  shim->getDeviceMr = ncclNet_v7->getDeviceMr;
  shim->irecvConsumed = ncclNet_v7->irecvConsumed;

  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -32,6 +83,7 @@ static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -42,38 +94,43 @@ static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
|
||||
// v6 -> v8 adapter for regMr. The v8 entry point receives the size as a
// size_t, but it is narrowed to int for the v6 call, so any size that does
// not fit in 31 bits is rejected with ncclInternalError instead of being
// silently truncated.
static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound must be computed in size_t: a plain `1<<31` overflows int and,
  // once converted for the comparison, becomes a huge value that would let
  // sizes >= 2 GiB slip through.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// v6 -> v8 adapter for connect: the trailing device-handle argument is
// ignored and the remaining arguments are forwarded to the v6 plugin.
static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  const ncclResult_t status = ncclNet_v6->connect(dev, handle, sendComm);
  return status;
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
|
||||
// v6 -> v8 adapter for accept: the trailing device-handle argument is
// ignored and the remaining arguments are forwarded to the v6 plugin.
static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  const ncclResult_t status = ncclNet_v6->accept(listenComm, recvComm);
  return status;
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v6->init(logfn));
|
||||
ncclNet_v6_as_v7.name = ncclNet_v6->name;
|
||||
ncclNet_v6_as_v7.devices = ncclNet_v6->devices;
|
||||
ncclNet_v6_as_v7.getProperties = ncclNet_v6_as_v7_getProperties; // ncclNet_v5->getProperties;
|
||||
ncclNet_v6_as_v7.listen = ncclNet_v6->listen;
|
||||
ncclNet_v6_as_v7.connect = ncclNet_v6_as_v7_connect;
|
||||
ncclNet_v6_as_v7.accept = ncclNet_v6_as_v7_accept;
|
||||
ncclNet_v6_as_v7.regMr = ncclNet_v6->regMr;
|
||||
ncclNet_v6_as_v7.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
|
||||
ncclNet_v6_as_v7.deregMr = ncclNet_v6->deregMr;
|
||||
ncclNet_v6_as_v7.isend = ncclNet_v6->isend;
|
||||
ncclNet_v6_as_v7.irecv = ncclNet_v6->irecv;
|
||||
ncclNet_v6_as_v7.iflush = ncclNet_v6->iflush;
|
||||
ncclNet_v6_as_v7.test = ncclNet_v6->test;
|
||||
ncclNet_v6_as_v7.closeSend = ncclNet_v6->closeSend;
|
||||
ncclNet_v6_as_v7.closeRecv = ncclNet_v6->closeRecv;
|
||||
ncclNet_v6_as_v7.closeListen = ncclNet_v6->closeListen;
|
||||
ncclNet_v6_as_v7.getDeviceMr = NULL;
|
||||
ncclNet_v6_as_v7.irecvConsumed = NULL;
|
||||
ncclNet_v6_as_v8.name = ncclNet_v6->name;
|
||||
ncclNet_v6_as_v8.devices = ncclNet_v6->devices;
|
||||
ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties;
|
||||
ncclNet_v6_as_v8.listen = ncclNet_v6->listen;
|
||||
ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect;
|
||||
ncclNet_v6_as_v8.accept = ncclNet_v6_as_v8_accept;
|
||||
ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr;
|
||||
ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
|
||||
ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr;
|
||||
ncclNet_v6_as_v8.isend = ncclNet_v6->isend;
|
||||
ncclNet_v6_as_v8.irecv = ncclNet_v6->irecv;
|
||||
ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush;
|
||||
ncclNet_v6_as_v8.test = ncclNet_v6->test;
|
||||
ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend;
|
||||
ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv;
|
||||
ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen;
|
||||
ncclNet_v6_as_v8.getDeviceMr = NULL;
|
||||
ncclNet_v6_as_v8.irecvConsumed = NULL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -81,6 +138,7 @@ static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -91,40 +149,45 @@ static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v5_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
|
||||
// v5 -> v8 adapter for regMr. The v8 entry point receives the size as a
// size_t, but it is narrowed to int for the v5 call, so any size that does
// not fit in 31 bits is rejected with ncclInternalError instead of being
// silently truncated.
static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound must be computed in size_t: a plain `1<<31` overflows int and,
  // once converted for the comparison, becomes a huge value that would let
  // sizes >= 2 GiB slip through.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// v5 -> v8 adapter for connect: the trailing device-handle argument is
// ignored and the remaining arguments are forwarded to the v5 plugin.
static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  const ncclResult_t status = ncclNet_v5->connect(dev, handle, sendComm);
  return status;
}
|
||||
|
||||
static ncclResult_t ncclNet_v5_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
|
||||
// v5 -> v8 adapter for accept: the trailing device-handle argument is
// ignored and the remaining arguments are forwarded to the v5 plugin.
static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  const ncclResult_t status = ncclNet_v5->accept(listenComm, recvComm);
  return status;
}
|
||||
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v5->init(logfn));
|
||||
ncclNet_v5_as_v7.name = ncclNet_v5->name;
|
||||
ncclNet_v5_as_v7.devices = ncclNet_v5->devices;
|
||||
ncclNet_v5_as_v7.getProperties = ncclNet_v5_as_v7_getProperties;
|
||||
ncclNet_v5_as_v7.listen = ncclNet_v5->listen;
|
||||
ncclNet_v5_as_v7.connect = ncclNet_v5_as_v7_connect;
|
||||
ncclNet_v5_as_v7.accept = ncclNet_v5_as_v7_accept;
|
||||
ncclNet_v5_as_v7.regMr = ncclNet_v5->regMr;
|
||||
ncclNet_v5_as_v7.regMrDmaBuf = NULL;
|
||||
ncclNet_v5_as_v7.deregMr = ncclNet_v5->deregMr;
|
||||
ncclNet_v5_as_v7.isend = ncclNet_v5->isend;
|
||||
ncclNet_v5_as_v7.irecv = ncclNet_v5->irecv;
|
||||
ncclNet_v5_as_v7.iflush = ncclNet_v5->iflush;
|
||||
ncclNet_v5_as_v7.test = ncclNet_v5->test;
|
||||
ncclNet_v5_as_v7.closeSend = ncclNet_v5->closeSend;
|
||||
ncclNet_v5_as_v7.closeRecv = ncclNet_v5->closeRecv;
|
||||
ncclNet_v5_as_v7.closeListen = ncclNet_v5->closeListen;
|
||||
ncclNet_v5_as_v7.getDeviceMr = NULL;
|
||||
ncclNet_v5_as_v7.irecvConsumed = NULL;
|
||||
ncclNet_v5_as_v8.name = ncclNet_v5->name;
|
||||
ncclNet_v5_as_v8.devices = ncclNet_v5->devices;
|
||||
ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties;
|
||||
ncclNet_v5_as_v8.listen = ncclNet_v5->listen;
|
||||
ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect;
|
||||
ncclNet_v5_as_v8.accept = ncclNet_v5_as_v8_accept;
|
||||
ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr;
|
||||
ncclNet_v5_as_v8.regMrDmaBuf = NULL;
|
||||
ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr;
|
||||
ncclNet_v5_as_v8.isend = ncclNet_v5->isend;
|
||||
ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv;
|
||||
ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush;
|
||||
ncclNet_v5_as_v8.test = ncclNet_v5->test;
|
||||
ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend;
|
||||
ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv;
|
||||
ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen;
|
||||
ncclNet_v5_as_v8.getDeviceMr = NULL;
|
||||
ncclNet_v5_as_v8.irecvConsumed = NULL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -132,6 +195,7 @@ static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -142,28 +206,35 @@ static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// CollNet v5 -> v8 adapter for regMr. The v8 entry point receives the size as
// a size_t, but it is narrowed to int for the v5 call, so any size that does
// not fit in 31 bits is rejected with ncclInternalError instead of being
// silently truncated.
static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound must be computed in size_t: a plain `1<<31` overflows int and,
  // once converted for the comparison, becomes a huge value that would let
  // sizes >= 2 GiB slip through.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v5->init(logfn));
|
||||
ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
|
||||
ncclCollNet_v5_as_v7.devices = ncclCollNet_v5->devices;
|
||||
ncclCollNet_v5_as_v7.getProperties = ncclCollNet_v5_as_v7_getProperties;
|
||||
ncclCollNet_v5_as_v7.listen = ncclCollNet_v5->listen;
|
||||
ncclCollNet_v5_as_v7.connect = ncclCollNet_v5->connect;
|
||||
ncclCollNet_v5_as_v7.reduceSupport = ncclCollNet_v5->reduceSupport;
|
||||
ncclCollNet_v5_as_v7.regMr = ncclCollNet_v5->regMr;
|
||||
ncclCollNet_v5_as_v7.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v5_as_v7.deregMr = ncclCollNet_v5->deregMr;
|
||||
ncclCollNet_v5_as_v7.iallreduce = ncclCollNet_v5->iallreduce;
|
||||
ncclCollNet_v5_as_v7.iflush = ncclCollNet_v5->iflush;
|
||||
ncclCollNet_v5_as_v7.test = ncclCollNet_v5->test;
|
||||
ncclCollNet_v5_as_v7.closeColl = ncclCollNet_v5->closeColl;
|
||||
ncclCollNet_v5_as_v7.closeListen = ncclCollNet_v5->closeListen;
|
||||
ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
|
||||
ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices;
|
||||
ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties;
|
||||
ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen;
|
||||
ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect;
|
||||
ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport;
|
||||
ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr;
|
||||
ncclCollNet_v5_as_v8.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr;
|
||||
ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce;
|
||||
ncclCollNet_v5_as_v8.iallgather = nullptr;
|
||||
ncclCollNet_v5_as_v8.ireducescatter = nullptr;
|
||||
ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush;
|
||||
ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test;
|
||||
ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl;
|
||||
ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -171,6 +242,7 @@ static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -181,24 +253,78 @@ static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// CollNet v6 -> v8 adapter for regMr. The v8 entry point receives the size as
// a size_t, but it is narrowed to int for the v6 call, so any size that does
// not fit in 31 bits is rejected with ncclInternalError instead of being
// silently truncated.
static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound must be computed in size_t: a plain `1<<31` overflows int and,
  // once converted for the comparison, becomes a huge value that would let
  // sizes >= 2 GiB slip through.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// We use a wrapper around the v6 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v6->init(logfn));
|
||||
ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
|
||||
ncclCollNet_v6_as_v7.devices = ncclCollNet_v6->devices;
|
||||
ncclCollNet_v6_as_v7.getProperties = ncclCollNet_v6_as_v7_getProperties;
|
||||
ncclCollNet_v6_as_v7.listen = ncclCollNet_v6->listen;
|
||||
ncclCollNet_v6_as_v7.connect = ncclCollNet_v6->connect;
|
||||
ncclCollNet_v6_as_v7.reduceSupport = ncclCollNet_v6->reduceSupport;
|
||||
ncclCollNet_v6_as_v7.regMr = ncclCollNet_v6->regMr;
|
||||
ncclCollNet_v6_as_v7.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
|
||||
ncclCollNet_v6_as_v7.deregMr = ncclCollNet_v6->deregMr;
|
||||
ncclCollNet_v6_as_v7.iallreduce = ncclCollNet_v6->iallreduce;
|
||||
ncclCollNet_v6_as_v7.iflush = ncclCollNet_v6->iflush;
|
||||
ncclCollNet_v6_as_v7.test = ncclCollNet_v6->test;
|
||||
ncclCollNet_v6_as_v7.closeColl = ncclCollNet_v6->closeColl;
|
||||
ncclCollNet_v6_as_v7.closeListen = ncclCollNet_v6->closeListen;
|
||||
ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
|
||||
ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices;
|
||||
ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties;
|
||||
ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen;
|
||||
ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect;
|
||||
ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport;
|
||||
ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr;
|
||||
ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
|
||||
ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr;
|
||||
ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce;
|
||||
ncclCollNet_v6_as_v8.iallgather = nullptr;
|
||||
ncclCollNet_v6_as_v8.ireducescatter = nullptr;
|
||||
ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush;
|
||||
ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test;
|
||||
ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl;
|
||||
ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// CollNet v7 -> v8 adapter for getProperties: queries the v7 CollNet plugin
// and copies the result field by field into the v8 structure. On failure the
// plugin's error code is returned and *props is left untouched.
static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v7_t v7Props;
  const ncclResult_t status = ncclCollNet_v7->getProperties(dev, &v7Props);
  if (status != ncclSuccess) {
    return status;
  }

  props->name = v7Props.name;
  props->pciPath = v7Props.pciPath;
  props->guid = v7Props.guid;
  props->ptrSupport = v7Props.ptrSupport;
  // Not taken from the v7 result: always reported as 0 by this adapter.
  props->regIsGlobal = 0;
  props->speed = v7Props.speed;
  props->port = v7Props.port;
  props->maxComms = v7Props.maxComms;
  props->maxRecvs = v7Props.maxRecvs;
  props->latency = v7Props.latency;
  // The device type/version reported by the plugin are not forwarded here;
  // fixed host / invalid-version values are reported instead.
  props->netDeviceType = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;

  return ncclSuccess;
}
|
||||
|
||||
// CollNet v7 -> v8 adapter for regMr. The v8 entry point receives the size as
// a size_t, but it is narrowed to int for the v7 call, so any size that does
// not fit in 31 bits is rejected with ncclInternalError instead of being
// silently truncated.
static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound must be computed in size_t: a plain `1<<31` overflows int and,
  // once converted for the comparison, becomes a huge value that would let
  // sizes >= 2 GiB slip through.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// We use a wrapper around the v7 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v7->init(logfn));
|
||||
ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
|
||||
ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices;
|
||||
ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties;
|
||||
ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen;
|
||||
ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect;
|
||||
ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport;
|
||||
ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr;
|
||||
ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
|
||||
ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr;
|
||||
ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce;
|
||||
ncclCollNet_v7_as_v8.iallgather = nullptr;
|
||||
ncclCollNet_v7_as_v8.ireducescatter = nullptr;
|
||||
ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush;
|
||||
ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test;
|
||||
ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl;
|
||||
ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -236,54 +362,72 @@ ncclResult_t ncclNetPluginInit() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclNets[0] = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
|
||||
ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8");
|
||||
if (ncclNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol.");
|
||||
// Try v6 plugin
|
||||
ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
|
||||
if (ncclNet_v6 == nullptr) {
|
||||
// Try v5 plugin
|
||||
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (ncclNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return ncclSuccess;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol.");
|
||||
// Try v7 plugin
|
||||
ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
|
||||
if (ncclNet_v7 == nullptr) {
|
||||
// Try v6 plugin
|
||||
ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
|
||||
if (ncclNet_v6 == nullptr) {
|
||||
// Try v5 plugin
|
||||
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (ncclNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v5_as_v8;
|
||||
ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v5_as_v8.name = ncclNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v5_as_v7;
|
||||
ncclNet_v5_as_v7.init = ncclNet_v5_as_v7_init;
|
||||
ncclNets[0] = &ncclNet_v6_as_v8;
|
||||
ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v5_as_v7.name = ncclNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
|
||||
ncclNet_v6_as_v8.name = ncclNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v6_as_v7;
|
||||
ncclNet_v6_as_v7.init = ncclNet_v6_as_v7_init;
|
||||
ncclNets[0] = &ncclNet_v7_as_v8;
|
||||
ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v6_as_v7.name = ncclNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
|
||||
ncclNet_v7_as_v8.name = ncclNet_v7->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name);
|
||||
}
|
||||
}
|
||||
|
||||
// Check for CollNet
|
||||
ncclCollNets[0] = (ncclCollNet_v7_t*) dlsym(netPluginLib, "ncclCollNetPlugin_v7");
|
||||
ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8");
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
|
||||
ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
|
||||
if (ncclCollNet_v6 == nullptr) {
|
||||
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (ncclCollNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol.");
|
||||
ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7");
|
||||
if (ncclCollNet_v7 == nullptr) {
|
||||
ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
|
||||
if (ncclCollNet_v6 == nullptr) {
|
||||
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (ncclCollNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v5_as_v8;
|
||||
ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init;
|
||||
ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v5_as_v7;
|
||||
ncclCollNet_v5_as_v7.init = ncclCollNet_v5_as_v7_init;
|
||||
ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
|
||||
ncclCollNets[0] = &ncclCollNet_v6_as_v8;
|
||||
ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init;
|
||||
ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v6_as_v7;
|
||||
ncclCollNet_v6_as_v7.init = ncclCollNet_v6_as_v7_init;
|
||||
ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
|
||||
ncclCollNets[0] = &ncclCollNet_v7_as_v8;
|
||||
ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init;
|
||||
ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v7)", ncclCollNets[0]->name);
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -329,6 +473,7 @@ static ncclResult_t netGetState(int i, enum ncclNetState* state) {
|
||||
}
|
||||
|
||||
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
|
||||
pthread_mutex_lock(&netLock);
|
||||
if (ncclCollNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
@@ -336,6 +481,7 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
|
||||
else ncclCollNetStates[i] = ncclNetStateEnabled;
|
||||
}
|
||||
*state = ncclCollNetStates[i];
|
||||
pthread_mutex_unlock(&netLock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -416,7 +562,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
|
||||
while (!connected) {
|
||||
|
||||
// If we're aborting now, skip to cleanup
|
||||
if (*comm->abortFlag) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
|
||||
goto cleanup2;
|
||||
}
|
||||
|
||||
@@ -453,11 +599,9 @@ cleanup1:
|
||||
}
|
||||
|
||||
int ncclNetVersion(struct ncclComm* comm) {
|
||||
if (comm->ncclNet == &ncclNet_v5_as_v7) {
|
||||
return 5;
|
||||
} else if (comm->ncclNet == &ncclNet_v6_as_v7) {
|
||||
return 6;
|
||||
} else {
|
||||
return 7;
|
||||
}
|
||||
return
|
||||
(comm->ncclNet == &ncclNet_v5_as_v8) ? 5 :
|
||||
(comm->ncclNet == &ncclNet_v6_as_v8) ? 6 :
|
||||
(comm->ncclNet == &ncclNet_v7_as_v8) ? 7 :
|
||||
8;
|
||||
}
|
||||
|
||||
+142
-40
@@ -353,20 +353,22 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
WARN("Proxy append out of bounds");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
//memset(sub, 0, sizeof(struct ncclProxySubArgs));
|
||||
sub->connection = op->connection;
|
||||
sub->channelId = op->channelId;
|
||||
sub->nsteps = op->nsteps;
|
||||
sub->nbytes = op->nbytes;
|
||||
sub->peer = op->root;
|
||||
sub->reg = op->reg;
|
||||
sub->buffer = op->buffer;
|
||||
args->nsubs = subIndex+1;
|
||||
if (subIndex) {
|
||||
if ((args->sliceSteps != op->sliceSteps) ||
|
||||
(args->chunkSteps != op->chunkSteps) ||
|
||||
(args->protocol != op->protocol) ||
|
||||
(args->dtype != op->dtype) ||
|
||||
(args->redOp != op->redOp)) {
|
||||
(args->redOp != op->redOp) ||
|
||||
(args->coll != op->coll)) {
|
||||
WARN("Proxy append mismatch");
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -386,6 +388,8 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
args->redOp = op->redOp;
|
||||
args->pattern = op->pattern;
|
||||
args->protocol = op->protocol;
|
||||
args->coll = op->coll;
|
||||
args->specifics = op->specifics;
|
||||
args->state = ncclProxyOpReady;
|
||||
args->progress = op->connection->tcomm->proxyProgress;
|
||||
args->proxyAppendPtr = op->connection->proxyAppendPtr;
|
||||
@@ -590,7 +594,7 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
|
||||
|
||||
NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
|
||||
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) {
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op, int reg) {
|
||||
memset(op, 0, sizeof(struct ncclProxyOp));
|
||||
int channelId = info->channelId;
|
||||
struct ncclChannel* channel = info->comm->channels+channelId;
|
||||
@@ -611,15 +615,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
op->pattern = ncclPatternSend;
|
||||
if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && peer->send[1].proxyConn.sameProcess) op->reg = reg;
|
||||
}
|
||||
} else if (info->coll == ncclFuncRecv) {
|
||||
op->pattern = ncclPatternRecv;
|
||||
if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && peer->recv[1].proxyConn.sameProcess) op->reg = reg;
|
||||
}
|
||||
} else {
|
||||
WARN("P2p operation is neither send or recv");
|
||||
@@ -628,17 +634,21 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
if (ncclParamChunkSize() != 0) {
|
||||
info->chunkSize = ncclParamChunkSize();
|
||||
}
|
||||
op->buffer = op->reg ? info->recvbuff : NULL;
|
||||
op->chunkSize = info->chunkSize;
|
||||
op->nbytes = info->count;
|
||||
|
||||
// Compute nSteps for proxies
|
||||
int chunkEffectiveSize = op->chunkSize;
|
||||
if (op->protocol == NCCL_PROTO_LL) {
|
||||
chunkEffectiveSize /= 2;
|
||||
op->nbytes *= 2;
|
||||
op->nbytes = DIVUP(op->nbytes, sizeof(union ncclLLFifoLine)) * sizeof(union ncclLLFifoLine);
|
||||
}
|
||||
|
||||
op->nbytes = stepSize;
|
||||
if (!op->reg) op->nbytes = std::min(op->nbytes, (ssize_t)info->chunkSize);
|
||||
op->nsteps = DIVUP(info->count, chunkEffectiveSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
if (op->nsteps == 0 || op->reg) op->nsteps = 1;
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1069,35 +1079,60 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
// The response is sent out-of-band using ncclIpcSocket for this specific command
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
ncclResult_t res = ncclInProgress;
|
||||
// UDS support
|
||||
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
|
||||
ncclResult_t res = ncclSuccess;
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
void *opId = (void*)((((uintptr_t)random()) << 32) | random());
|
||||
|
||||
// Create a UDS socket to receive the converted fd
|
||||
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank];
|
||||
|
||||
// Request the allocation of a UDS fd for the handle over sockets
|
||||
NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), 0, opId), ret, error);
|
||||
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p",
|
||||
comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId);
|
||||
|
||||
// Receive the converted fd over UDS
|
||||
NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, error);
|
||||
TRACE(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx rank %d returned fd %d", *(uint64_t*)handle, proxyConn->tpLocalRank, *convertedFd);
|
||||
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error);
|
||||
// cuMem: Create a UDS socket to receive the response
|
||||
NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag));
|
||||
|
||||
// Wait for proxy response (sockets)
|
||||
while (res == ncclInProgress) {
|
||||
res = ncclPollProxyResponse(comm, proxyConn, NULL, opId);
|
||||
}
|
||||
ncclIpcHdr hdr;
|
||||
hdr.type = type;
|
||||
hdr.rank = rank;
|
||||
hdr.reqSize = reqSize;
|
||||
hdr.respSize = respSize;
|
||||
hdr.opId = opId;
|
||||
assert(reqSize <= sizeof(hdr.data));
|
||||
memcpy(&hdr.data, reqBuff, reqSize);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error);
|
||||
|
||||
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE",
|
||||
comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
|
||||
|
||||
return res;
|
||||
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
// The request/response is sent out-of-band using ncclIpcSocket for this specific command
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
// Request the allocation of a UDS fd for the handle
|
||||
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error);
|
||||
|
||||
// We have now received the converted fd over UDS
|
||||
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd);
|
||||
|
||||
return ret;
|
||||
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
WARN("ncclProxyClientGetFd call to rank %d handle 0x%lx failed : %d", proxyConn->tpRank, *(uint64_t*)handle, ret);
|
||||
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1132,7 +1167,7 @@ error:
|
||||
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
// Receive the connection pointer from the Proxy
|
||||
if (*comm->abortFlag) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
|
||||
WARN("Comm %p is in abort state", comm);
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -1287,13 +1322,13 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, uint64_t handle) {
|
||||
static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
// cuMem API support
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
uint64_t hash = (uint64_t) opId;
|
||||
INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, peer->tpLocalRank, hash);
|
||||
INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, rank, hash);
|
||||
|
||||
CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
int fd = -1;
|
||||
@@ -1301,7 +1336,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, stru
|
||||
CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0));
|
||||
// Send back the converted fd using UDS
|
||||
NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash), ret, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, rank, hash), ret, error);
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
// We can now safely close the exported fd
|
||||
@@ -1326,11 +1361,8 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
|
||||
if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels);
|
||||
__atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
|
||||
} else if (op->type == ncclProxyMsgGetFd) {
|
||||
uint64_t handle = *(uint64_t*)op->reqBuff;
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgGetFd opId=%p op.reqBuff=%p handle=0x%lx", op->opId, op->reqBuff, handle);
|
||||
res = proxyGetFd(peer, op->opId, proxyState, handle); // cuMem API support
|
||||
} else if (op->type == ncclProxyMsgInit) {
|
||||
}
|
||||
else if (op->type == ncclProxyMsgInit) {
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
|
||||
res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
|
||||
} else return ncclInternalError;
|
||||
@@ -1360,7 +1392,7 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
|
||||
(*asyncOpCount)--;
|
||||
return ncclSuccess;
|
||||
|
||||
} else if (*proxyState->abortFlag != 0) {
|
||||
} else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) {
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -1446,7 +1478,7 @@ void* ncclProxyService(void* _args) {
|
||||
/* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
|
||||
* connections. Need to wait until all other related comms call abort and safely exit
|
||||
* together, or we could face segmentation fault. */
|
||||
if (*proxyState->abortFlag != 0) stop = 1;
|
||||
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) stop = 1;
|
||||
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
||||
int ret;
|
||||
do {
|
||||
@@ -1563,13 +1595,71 @@ void* ncclProxyService(void* _args) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
|
||||
|
||||
// Process a request on the UDS socket
|
||||
// Receive and serve a single request from the proxy's UDS socket.
//
// One ncclIpcHdr message is read from proxyState->ipcSock and dispatched on its
// type. ncclProxyMsgGetFd is the only request type handled here; every other
// type is answered with ncclInternalError.
//
//   proxyState : proxy state owning the UDS socket the request is read from
//   reqFd      : not used by this function
//
// Returns the result of the receive or of proxyGetFd(), or ncclInternalError
// for an unsupported request type.
static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) {
  ncclIpcHdr msg;
  NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &msg, sizeof(msg), NULL));

  // Reject anything that is not a GetFd request
  if (msg.type != ncclProxyMsgGetFd) return ncclInternalError;

  // cuMem API support: the request payload begins with the 64-bit handle
  uint64_t handle = *(uint64_t*)msg.data;
  INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", msg.rank, msg.opId, handle);
  return proxyGetFd(proxyState, msg.rank, msg.opId, handle);
}
|
||||
|
||||
// UDS fd handle support
|
||||
void* ncclProxyServiceUDS(void* _args) {
|
||||
struct ncclProxyState* proxyState = (struct ncclProxyState*) _args;
|
||||
struct pollfd pollfds[1];
|
||||
|
||||
if (setProxyThreadContext(proxyState)) {
|
||||
INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev);
|
||||
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
|
||||
WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev);
|
||||
}
|
||||
|
||||
if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) {
|
||||
WARN("[Proxy Service UDS] Get listenSock fd fails");
|
||||
return NULL;
|
||||
};
|
||||
pollfds[0].events = POLLIN|POLLHUP;
|
||||
|
||||
while (1) {
|
||||
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
||||
int ret;
|
||||
do {
|
||||
ret = poll(pollfds, 1, 500);
|
||||
} while (ret < 0 && errno == EINTR);
|
||||
if (ret < 0) {
|
||||
WARN("[Proxy Service UDS] Poll failed: %s", strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Check for stop/abort
|
||||
if (proxyState->stop || *proxyState->abortFlag) break;
|
||||
|
||||
if (pollfds[0].revents) {
|
||||
// A request was seen on the UDS fd
|
||||
proxyUDSRecvReq(proxyState, pollfds[0].fd);
|
||||
}
|
||||
}
|
||||
|
||||
ncclIpcSocketClose(&proxyState->ipcSock);
|
||||
INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, *proxyState->abortFlag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS) {
|
||||
assert(comm->sharedRes->proxyState == NULL);
|
||||
NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1));
|
||||
comm->proxyState = comm->sharedRes->proxyState;
|
||||
comm->proxyState->refCount = 1;
|
||||
comm->proxyState->listenSock = sock;
|
||||
comm->proxyState->peerAddresses = peerAddresses;
|
||||
comm->proxyState->peerAddressesUDS = peerAddressesUDS;
|
||||
// Seed the random number generator for UDS filename generation
|
||||
struct timeval time;
|
||||
gettimeofday(&time,NULL);
|
||||
@@ -1601,6 +1691,12 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
|
||||
|
||||
pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
|
||||
ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
|
||||
|
||||
// UDS support
|
||||
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d pidHash %lx", comm, comm->rank, comm->peerInfo[comm->rank].pidHash);
|
||||
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, comm->peerInfo[comm->rank].pidHash, comm->abortFlag));
|
||||
pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
|
||||
ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1610,8 +1706,13 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
|
||||
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
|
||||
if (comm->proxyState->threadUDS) {
|
||||
// UDS support
|
||||
comm->proxyState->stop = 1;
|
||||
}
|
||||
|
||||
if (sharedProxyState->peerAddresses) {
|
||||
if (*comm->abortFlag == 0) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) {
|
||||
struct ncclSocket sock;
|
||||
int type = ncclProxyMsgStop;
|
||||
NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
|
||||
@@ -1636,7 +1737,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
int type = ncclProxyMsgClose;
|
||||
if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
|
||||
NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
|
||||
}
|
||||
}
|
||||
@@ -1652,6 +1753,7 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
|
||||
|
||||
assert(sharedProxyState->refCount == 0);
|
||||
free(sharedProxyState->peerAddresses);
|
||||
free(sharedProxyState->peerAddressesUDS);
|
||||
free(sharedProxyState->peerSocks);
|
||||
free(sharedProxyState->proxyOps);
|
||||
free(sharedProxyState->sharedDevMems);
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "argcheck.h" // Need some checks here since we access comm
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
#include "register.h"
|
||||
|
||||
// Release every network memory registration recorded in `reg`.
//
// For each device listed in reg->devs that holds a non-NULL handle, deregMr()
// is called on the cached send comm of that device. Afterwards reg->nDevs is
// reset to 0 and the handle array is freed.
//
// Unlike a plain early return, a failing deregMr() does not stop the cleanup:
// the remaining handles are still released, the handle array is still freed and
// the global ncclDebugNoWarn override is always restored. The first error
// encountered (if any) is returned.
ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
  struct ncclRegCache* cache = &comm->regCache;
  ncclResult_t ret = ncclSuccess;
  ncclDebugNoWarn = NCCL_NET;
  // handles can be NULL while nDevs is set if registration stopped before the
  // handle array was allocated; there is nothing to deregister in that case.
  if (reg->handles != NULL) {
    for (int d=0; d<reg->nDevs; d++) {
      if (reg->handles[d] == NULL) continue;
      ncclResult_t res = comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d]);
      reg->handles[d] = NULL;
      // Keep the first real failure; ncclInProgress is not treated as an error
      if (res != ncclSuccess && res != ncclInProgress && ret == ncclSuccess) ret = res;
    }
  }
  reg->nDevs = 0;
  free(reg->handles);
  reg->handles = NULL;
  ncclDebugNoWarn = 0;
  return ret;
}
|
||||
|
||||
// Register the buffer [addr, addr+size) with every local network device used
// by the p2p channels, recording the devices and handles in `reg`.
//
// Steps:
//  1. Collect the distinct local NICs of all p2p channels into reg->devs. If
//     any NIC does not advertise regIsGlobal, or a channel has no local NIC,
//     nothing is registered (reg->nDevs is left at 0).
//  2. For each collected NIC, make sure a loopback send/recv comm pair exists
//     in the registration cache (created via listen/connect/accept), then call
//     regMr() on the send comm.
//
// A regMr() failure is not reported as an error: all registrations made so far
// are rolled back and ncclSuccess is returned with reg->nDevs == 0. Other
// failures roll back and return the error. The ncclDebugNoWarn override and the
// temporary listen comm are released on every path.
ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
  struct ncclRegCache* cache = &comm->regCache;
  int netCount;
  NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
  if (netCount == 0) return ncclSuccess;

  ncclResult_t ret = ncclSuccess;

  // Find local devices for p2p operations
  for (int c=0; c<comm->p2pnChannels; c++) {
    int dev;
    if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, &dev) != ncclSuccess) {
      // No local net: do not leave a device list behind without handles
      reg->nDevs = 0;
      goto end;
    }
    ncclNetProperties_t props;
    NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
    if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
      reg->nDevs = 0;
      break;
    }
    int found = 0;
    for (int d=0; d<reg->nDevs; d++) if (reg->devs[d] == dev) found = 1;
    if (!found) reg->devs[reg->nDevs++] = dev;
  }

  // Nothing to register: skip the zero-sized allocation altogether
  if (reg->nDevs == 0) goto end;

  NCCLCHECKGOTO(ncclCalloc(&reg->handles, reg->nDevs), ret, end);

  ncclDebugNoWarn = NCCL_NET;
  for (int d=0; d<reg->nDevs; d++) {
    int dev = reg->devs[d];
    reg->handles[d] = NULL;

    if (cache->sComms[dev] == NULL) {
      // Create a loopback network comm object for that device to register the buffers.
      void *lComm = NULL;
      ncclNetHandle_t netHandle;
      bool connected = false;
      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end);
      while (!connected) {
        // On abort, stop connecting; ret stays ncclSuccess as before
        if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) break;
        if (cache->sComms[dev] == NULL)
          NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, close_listen);
        if (cache->rComms[dev] == NULL)
          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, close_listen);
        connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL);
      }
    close_listen:
      {
        // The listen comm is only needed to establish the loopback pair;
        // release it whether or not the connection was established.
        ncclResult_t closeRet = comm->ncclNet->closeListen(lComm);
        if (ret == ncclSuccess) ret = closeRet;
      }
      if (ret != ncclSuccess || !connected) goto end;
    }
    if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) {
      // Registration is best effort: undo what was registered so far and leave
      // the buffer unregistered. Only a failing rollback is reported.
      reg->handles[d] = NULL;
      ret = ncclNetDeregister(comm, reg);
      reg->nDevs = 0;
      goto end;
    }
  }
end:
  ncclDebugNoWarn = 0;
  if (ret != ncclSuccess) {
    if (reg->handles == NULL) {
      // Failed before any handle storage existed: nothing to deregister
      reg->nDevs = 0;
    } else {
      NCCLCHECK(ncclNetDeregister(comm, reg));
    }
  }
  return ret;
}
|
||||
|
||||
// Look up a cached registration that fully contains [data, data+size).
//
// The requested range is widened to page boundaries (cache->pageSize) and the
// cache slots are scanned in order. The scan ends at the first slot whose base
// address is above the requested page address, or at the end of the populated
// slots. *reg receives the matching entry, or NULL when none contains the
// range. Always returns ncclSuccess.
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t pageSize = cache->pageSize;
  // Page-aligned start of the request and number of pages it spans
  uintptr_t addr = (uintptr_t)data & -pageSize;
  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;

  *reg = NULL;
  for (int idx=0; idx != cache->population; idx++) {
    struct ncclReg* entry = cache->slots[idx];
    // This entry starts beyond the request: stop searching
    if (addr < entry->addr) break;
    // Here addr >= entry->addr, so only the end of the range needs checking
    if (((addr-entry->addr)/pageSize+pages) <= entry->pages) {
      *reg = entry;
      break;
    }
  }
  return ncclSuccess;
}
|
||||
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
||||
|
||||
// Register a user buffer with the communicator's registration cache.
//
// The range [data, data+size) is widened to page boundaries. If an existing
// cache entry already contains it, that entry's reference count is incremented
// and it is returned. Otherwise a new entry is created, registered with the
// network through ncclNetRegister() and inserted at the scan position so that
// the slot array stays ordered by address.
//
//   handle : receives the cache entry, or NULL when local registration is
//            disabled through the LOCAL_REGISTER parameter.
//
// The cache is only modified once the new entry has been fully set up, so a
// failure leaves the existing entries untouched.
ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
  if (!ncclParamLocalRegister()) {
    // Never hand back an indeterminate handle
    *handle = NULL;
    return ncclSuccess;
  }
  ncclResult_t ret = ncclSuccess;
  struct ncclReg* regSlot = NULL;
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t pageSize = cache->pageSize;
  uintptr_t addr = (uintptr_t)data & -pageSize;
  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
  for (int slot=0; /*true*/; slot++) {
    if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
      if (cache->population == cache->capacity) { // must grow cache
        // Commit the new capacity only after the reallocation succeeded
        auto newCapacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
        NCCLCHECK(ncclRealloc(&cache->slots, cache->population, newCapacity));
        cache->capacity = newCapacity;
      }
      // Build and register the entry before touching the slot array, so that
      // a failure cannot leave the cache with shifted or half-initialized slots.
      NCCLCHECK(ncclCalloc(&regSlot, 1));
      regSlot->addr = addr;
      regSlot->pages = pages;
      regSlot->refs = 1;
      NCCLCHECKGOTO(ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot), ret, fail);
      regSlot->state |= NET_REG_COMPLETE;
      memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
      cache->slots[slot] = regSlot;
      cache->population += 1;
      *handle = regSlot;
      return ncclSuccess;
    } else if ((addr >= cache->slots[slot]->addr) &&
        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
      cache->slots[slot]->refs++;
      *handle = cache->slots[slot];
      return ncclSuccess;
    }
  }
fail:
  // The entry was never inserted in the cache; drop it
  free(regSlot);
  return ret;
}
|
||||
|
||||
// Tear down the communicator's registration cache.
//
// Every populated slot is deregistered from the network, deregistered from
// NVLS when its state has NVLS_REG_COMPLETE set, and freed. The slot array is
// then freed and all cached loopback send/recv comms are closed. Returns on
// the first failing call (NCCLCHECK), ncclSuccess otherwise.
ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
  struct ncclRegCache* cache = &comm->regCache;
  for (int i=0; i<cache->population; i++) {
    struct ncclReg* entry = cache->slots[i];
    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)entry->addr, entry->pages);
    NCCLCHECK(ncclNetDeregister(comm, entry));
    if (entry->state & NVLS_REG_COMPLETE) {
      NCCLCHECK(ncclNvlsDeregBuffer(&entry->mcHandle, entry->regAddr, entry->dev, entry->regSize));
    }
    free(entry);
  }
  free(cache->slots);

  // Close the loopback comms that were created for registration
  for (int d=0; d<MAXCHANNELS; d++) {
    void* sendComm = cache->sComms[d];
    if (sendComm) NCCLCHECK(comm->ncclNet->closeSend(sendComm));
    void* recvComm = cache->rComms[d];
    if (recvComm) NCCLCHECK(comm->ncclNet->closeRecv(recvComm));
  }
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
// Public API: register the user buffer [buff, buff+size) with `comm`.
//
//   comm   : communicator; must not be NULL
//   buff   : buffer to register; validated as a CUDA pointer when
//            comm->checkPointers is set
//   size   : size of the buffer in bytes
//   handle : out-parameter receiving the registration handle to pass to
//            ncclCommDeregister(); must not be NULL
//
// Returns the PtrCheck/CudaPtrCheck error for an invalid argument, or the
// result of ncclRegister().
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
  NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
  // handle is written by ncclRegister(); reject NULL instead of crashing
  NCCLCHECK(PtrCheck(handle, "ncclCommRegister", "handle"));
  if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
  NCCLCHECK(ncclRegister(comm, buff, size, handle));
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
// Public API: drop one reference on a registration obtained from
// ncclCommRegister().
//
// The handle is looked up in the communicator's registration cache. When the
// last reference is released, the buffer is deregistered from the network (and
// from NVLS when its state has NVLS_REG_COMPLETE set), the entry is freed and
// removed from the slot array.
//
// Returns ncclSuccess for a NULL handle (nothing was registered),
// ncclInvalidUsage when the handle is not found in the cache, or the first
// error of the deregistration calls.
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
  NCCLCHECK(PtrCheck(comm, "ncclCommDeregister", "comm"));
  // A NULL handle carries no registration, so there is nothing to release
  if (handle == NULL) return ncclSuccess;
  struct ncclReg* reg = (struct ncclReg*)handle;
  struct ncclRegCache* cache = &comm->regCache;
  int slot;
  // Only handles that are present in the cache are ever dereferenced
  for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
  if (slot == cache->population) {
    WARN("Deregister: Could not find handle");
    return ncclInvalidUsage;
  }
  // Still referenced by another registration of the same range
  if (--reg->refs) return ncclSuccess;
  NCCLCHECK(ncclNetDeregister(comm, reg));
  if (reg->state & NVLS_REG_COMPLETE) {
    NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
    reg->regAddr = (CUdeviceptr)NULL;
  }
  free(reg);
  // Close the gap left by the removed entry
  memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
  cache->population -= 1;
  return ncclSuccess;
}
|
||||
@@ -324,10 +324,10 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
for (int r = 0; r < nranks; r++) {
|
||||
if (allConnects[r].isMaster) {
|
||||
memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
|
||||
if (r == rank) rankInCollNet = c;
|
||||
c++;
|
||||
}
|
||||
}
|
||||
if (isMaster) rankInCollNet = comm->node;
|
||||
} else { // send side : copy in connect info received from peer recv master
|
||||
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
|
||||
}
|
||||
|
||||
+165
-99
@@ -86,8 +86,8 @@ struct connectMap {
|
||||
};
|
||||
|
||||
struct reqSlot {
|
||||
volatile void* recvBuff;
|
||||
volatile int size;
|
||||
bool turnIsSendNotRecv;
|
||||
int size;
|
||||
};
|
||||
|
||||
struct sendResources {
|
||||
@@ -243,9 +243,11 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
send->conn.tail = &recvMem->tail;
|
||||
send->conn.sizesFifo = recvMem->sizesFifo;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
|
||||
send->conn.offsFifo = recvMem->offsFifo;
|
||||
send->conn.connFifo = recvMem->connFifo;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
send->conn.connFifo[i].size = -1;
|
||||
send->conn.connFifo[i].mode = NCCL_MODE_OFFSET;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -274,7 +276,10 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
|
||||
recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
|
||||
recv->conn.offsFifo = recvMem->offsFifo;
|
||||
recv->conn.connFifo = recvMem->connFifo;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
recv->conn.connFifo[i].mode = NCCL_MODE_OFFSET;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -376,6 +381,8 @@ static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int
|
||||
|
||||
if (cuda && collNet->cudaBuff == NULL) {
|
||||
NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size));
|
||||
cudaMemset(collNet->cudaBuff, 0x33, *size/2);
|
||||
cudaMemset((char*)collNet->cudaBuff + *size/2, 0x66, *size/2);
|
||||
}
|
||||
if (!cuda && collNet->hostBuff == NULL) {
|
||||
NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
|
||||
@@ -471,7 +478,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
|
||||
resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
|
||||
// Don't give credits yet in shared mode.
|
||||
resources->sendMem->head = -NCCL_STEPS;
|
||||
(resources->gdcSync ? *resources->gdcSync : resources->sendMem->head) = -NCCL_STEPS;
|
||||
|
||||
// Allocate & Register shared buffers for the Simple protocol
|
||||
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
|
||||
@@ -617,9 +624,49 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int sub, uint64_t step) {
|
||||
int chunkSize = args->chunkSize;
|
||||
int nNodes = args->specifics.collnetDirect.nNodes;
|
||||
int node = args->specifics.collnetDirect.node;
|
||||
size_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
|
||||
size_t offset = (step*(args->nsubs) + sub)*chunkSize;
|
||||
if (isAllNotOne) {
|
||||
offset = std::min<size_t>(offset, nNodes*sizePerRank);
|
||||
} else {
|
||||
offset = std::max<size_t>(offset, (node+0)*sizePerRank);
|
||||
offset = std::min<size_t>(offset, (node+1)*sizePerRank);
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
#define LAST_OF_GROUP(s) \
|
||||
(s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
|
||||
static int calcRegionOffset(
|
||||
struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step,
|
||||
int side // 0=begin, 1=end
|
||||
) {
|
||||
struct ncclCollNetSharedRes* collNet = args->subs[0].connection->collNet;
|
||||
int slotSize = collNet->buffSize/NCCL_STEPS;
|
||||
int chunkSize = args->chunkSize;
|
||||
int base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS);
|
||||
base *= collNet->nChannels*slotSize;
|
||||
if (args->coll == ncclFuncAllReduce) {
|
||||
return base + (sub+side)*chunkSize;
|
||||
} else {
|
||||
int isAllNotOne = isRecvNotSend ^ (args->coll == ncclFuncReduceScatter);
|
||||
int sub0 = sub - (sub%COLLNET_GROUP_NSUBS);
|
||||
size_t off = sub0*slotSize;
|
||||
off += calcAlgoOffset(args, isAllNotOne, sub+side, step)
|
||||
- calcAlgoOffset(args, isAllNotOne, sub0, step);
|
||||
return base + off;
|
||||
}
|
||||
}
|
||||
|
||||
#define LAST_OF_GROUP(args, s) \
|
||||
((s)%COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || (s) == (args)->nsubs-1)
|
||||
|
||||
// Returns how many steps a sub may run ahead of its `done` counter; the proxy
// progress loops in this file compare against `sub->done + calcStepsPerGroup(nGroups)`.
// nGroups is currently ignored: the full NCCL_STEPS is returned whatever the
// group count. The per-group split is kept below, commented out.
static constexpr int calcStepsPerGroup(int nGroups) {
|
||||
// Disabled alternative: divide the steps evenly between the groups.
//return NCCL_STEPS/nGroups;
|
||||
return NCCL_STEPS;
|
||||
}
|
||||
|
||||
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
@@ -637,83 +684,117 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = NCCL_PROTO_SIMPLE;
|
||||
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
|
||||
int perGroupSteps = NCCL_STEPS / nGroups;
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
|
||||
void* sendMhandle = resources->sendMhandles[p];
|
||||
void* recvMhandle = resources->recvMhandles[p];
|
||||
char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
|
||||
auto reqFifo = resources->reqFifo;
|
||||
int group = s/COLLNET_GROUP_NSUBS;
|
||||
int groupStart = s - (s%COLLNET_GROUP_NSUBS);
|
||||
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->posted%NCCL_STEPS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
|
||||
resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
|
||||
resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
|
||||
__sync_synchronize();
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
|
||||
sub->posted += args->sliceSteps;
|
||||
*sendHead = sub->base + sub->posted - NCCL_STEPS;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
}
|
||||
// Enforce sync between operations of the same group.
|
||||
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received));
|
||||
if (groupSync && sub->received < sub->posted && sub->received < sub->done + perGroupSteps) {
|
||||
if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) {
|
||||
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->received%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
|
||||
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) {
|
||||
// We have something to receive, let's check whether data is ready.
|
||||
int ready = 1;
|
||||
if (s == 0) {
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
|
||||
args->sharedBuff[sharedBuffSlot] = localBuff + offset;
|
||||
args->sharedSize[sharedBuffSlot] = args->chunkSize;
|
||||
}
|
||||
if (ready) {
|
||||
sizesFifo[buffSlot] = -1;
|
||||
sub->received += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
//continue;
|
||||
if (connFifo[buffSlot].size != -1 && ((*recvTail > (sub->base+sub->received)))) {
|
||||
if (args->coll != ncclFuncAllReduce) {
|
||||
int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0);
|
||||
int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1);
|
||||
if (sendEnd-sendBeg != connFifo[buffSlot].size) {
|
||||
WARN("CollNet sizes: want=%d got=%ld", sendEnd-sendBeg, connFifo[buffSlot].size);
|
||||
return ncclInternalError;
|
||||
}
|
||||
}
|
||||
connFifo[buffSlot].size = -1;
|
||||
sub->received += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
if (LAST_OF_GROUP(s) && (sub->transmitted < sub->received)) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
|
||||
if (reqFifo[group][buffSlot].recvBuff != NULL) {
|
||||
int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
|
||||
int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
|
||||
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
|
||||
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] == NULL) continue;
|
||||
// Enforce collective ordering of collnet ops.
|
||||
bool ordered = s==0 ? args->subs[args->nsubs-1].transmitted == sub->transmitted
|
||||
: sub->transmitted < (sub-1)->transmitted;
|
||||
if (ordered && (sub->transmitted < sub->received)) {
|
||||
if (LAST_OF_GROUP(args, s)) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue;
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
sub->transmitted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
ssize_t sizePerRank = 0;
|
||||
size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted);
|
||||
size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted);
|
||||
int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0);
|
||||
int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1);
|
||||
int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0);
|
||||
int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1);
|
||||
reqFifo[group][buffSlot].size = recvEnd - recvBeg;
|
||||
size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
|
||||
|
||||
if (sendBeg==sendEnd && recvBeg==recvEnd) {
|
||||
sub->requests[buffSlot] = nullptr; // trivially finished request
|
||||
} else {
|
||||
if (args->coll == ncclFuncAllReduce) {
|
||||
int count = (sendEnd-sendBeg)/eltSize;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region+sendBeg, region+recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
} else {
|
||||
sizePerRank = args->specifics.collnetDirect.sizePerRank;
|
||||
if (args->coll == ncclFuncAllGather) {
|
||||
ncclNetSGE_v8_t recvParts;
|
||||
recvParts.mhandle = recvMhandle;
|
||||
recvParts.address = region + recvBeg;
|
||||
recvParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallgather(
|
||||
resources->collNetComm, region+sendBeg, 1, &recvParts,
|
||||
sizePerRank, allBeg, allEnd-allBeg,
|
||||
sendMhandle, sub->requests+buffSlot));
|
||||
} else {
|
||||
ncclNetSGE_v8_t sendParts;
|
||||
sendParts.mhandle = sendMhandle;
|
||||
sendParts.address = region + sendBeg;
|
||||
sendParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
|
||||
resources->collNetComm, 1, &sendParts, region+recvBeg,
|
||||
sizePerRank, allBeg, allEnd-allBeg,
|
||||
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
|
||||
recvMhandle, sub->requests+buffSlot));
|
||||
}
|
||||
}
|
||||
if (sub->requests[buffSlot] == nullptr) continue;
|
||||
|
||||
if (args->coll == ncclFuncAllReduce) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]);
|
||||
} else if (args->coll == ncclFuncAllGather) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]);
|
||||
} else {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]);
|
||||
}
|
||||
}
|
||||
}
|
||||
sub->transmitted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
// Check whether the network has completed some send operations.
|
||||
if (LAST_OF_GROUP(s) && sub->done < sub->transmitted) {
|
||||
if (LAST_OF_GROUP(args, s) && sub->done < sub->transmitted) {
|
||||
int done, size;
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
|
||||
done = 1;
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
|
||||
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
|
||||
// (reordered store after store is possible on POWER, though not on x86)
|
||||
__sync_synchronize();
|
||||
reqFifo[group][buffSlot].recvBuff = NULL; // Notify recvProxy
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].done += args->sliceSteps;
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done, size %d", (long)sub->done, group, buffSlot, sub->requests[buffSlot], size);
|
||||
sub->requests[buffSlot] = nullptr;
|
||||
reqFifo[group][buffSlot].turnIsSendNotRecv = false; // Notify recvProxy
|
||||
for (int i=groupStart; i<=s; i++) args->subs[i].done += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
int allDone = 1;
|
||||
for (int i=0; i<args->nsubs; i++) {
|
||||
@@ -721,7 +802,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
}
|
||||
if (allDone) {
|
||||
args->state = ncclProxyOpNone;
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] stopped", sub->done, s);
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] stopped", (long)sub->done, s);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -739,6 +820,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
sub->base = ROUNDUP(resources->step, args->chunkSteps);
|
||||
sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
memset(sub->requests, 0, sizeof(sub->requests));
|
||||
}
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
@@ -746,38 +828,32 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = NCCL_PROTO_SIMPLE;
|
||||
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
|
||||
int perGroupSteps = NCCL_STEPS / nGroups;
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
int group = s/COLLNET_GROUP_NSUBS;
|
||||
int groupStart = s - (s%COLLNET_GROUP_NSUBS);
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
|
||||
void* mhandle = resources->mhandles[p];
|
||||
auto reqFifo = resources->reqFifo;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
|
||||
// Enforce sync between operations of the same group.
|
||||
if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) {
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->posted%NCCL_STEPS;
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
|
||||
reqFifo[group][buffSlot].recvBuff = localBuff + offset;
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
|
||||
reqFifo[group][buffSlot].turnIsSendNotRecv = true;
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] posted buffer", (long)sub->posted, group, buffSlot);
|
||||
sub->posted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
if (LAST_OF_GROUP(s) && (sub->posted > sub->received)) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) {
|
||||
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->received%NCCL_STEPS;
|
||||
if (reqFifo[group][buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
|
||||
args->sharedSize[sharedBuffSlot] = reqFifo[group][buffSlot].size;
|
||||
int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1);
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
|
||||
if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete
|
||||
int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0);
|
||||
int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1);
|
||||
int totalSize = recvEnd - recvBeg;
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
|
||||
sub->received += args->sliceSteps;
|
||||
sub->requests[buffSlot] = NULL;
|
||||
if (reqFifo[group][buffSlot].size > 0 && resources->useGdr && resources->needFlush) {
|
||||
// GDRCOPY support
|
||||
if (resources->gdcFlush) {
|
||||
@@ -788,42 +864,31 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
WARN("NET: GDR Flush only supported on x86_64");
|
||||
return ncclInternalError;
|
||||
#endif
|
||||
sub->requests[buffSlot] = NULL;
|
||||
} else {
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
|
||||
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
|
||||
}
|
||||
} else {
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
}
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (LAST_OF_GROUP(s) && (sub->received > sub->flushed)) {
|
||||
if (LAST_OF_GROUP(args, s) && (sub->flushed < sub->received)) {
|
||||
// Progress flush operations
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
|
||||
int done = 1;
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] flushed", sub->flushed, group, buffSlot);
|
||||
sub->requests[buffSlot] = nullptr;
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] flushed", (long)sub->flushed, group, buffSlot);
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
//continue;
|
||||
}
|
||||
}
|
||||
if (sub->flushed > sub->transmitted) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
if (sub->transmitted < sub->flushed) {
|
||||
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
|
||||
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
|
||||
offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
|
||||
__sync_synchronize();
|
||||
volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
|
||||
*recvTail = sub->base + sub->flushed;
|
||||
@@ -835,14 +900,15 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// Enforce sync here to make sure the last sub doesn't increase "done" before all others in the group have
|
||||
// reached the same point, otherwise we would start posting buffers to the send proxy before we're done
|
||||
// processing all the shared buffer.
|
||||
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->done == sub->done)) || (s && (sub-1)->done > sub->done));
|
||||
bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done
|
||||
: (sub-1)->done > sub->done;
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) {
|
||||
sub->done += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps && s == args->nsubs-1) {
|
||||
args->state = ncclProxyOpNone;
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] stopped", sub->done, s);
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d] stopped", (long)sub->done, s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+123
-43
@@ -347,9 +347,12 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
send->conn.tail = &recvMem->tail;
|
||||
send->conn.sizesFifo = recvMem->sizesFifo;
|
||||
send->conn.connFifo = recvMem->connFifo;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
send->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
send->conn.connFifo[i].offset = -1;
|
||||
recvMem->connFifo[i].mode = map->shared ? NCCL_MODE_OFFSET : NCCL_MODE_NORMAL;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -409,9 +412,11 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
|
||||
recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
|
||||
recv->conn.sizesFifo = recvMem->sizesFifo;
|
||||
recv->conn.connFifo = recvMem->connFifo;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
recv->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
recvMem->connFifo[i].mode = map->shared ? NCCL_MODE_OFFSET : NCCL_MODE_NORMAL;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -510,10 +515,11 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) {
|
||||
// Looks up a slot of the shared P2P buffer pool.
//
//   channel, slot - slot `slot` of channel `channel`; every channel owns
//                   NCCL_SHARED_STEPS consecutive slots.
//   offset        - out: byte offset of the slot (slot index * p2pChunkSize).
//   size          - out, optional: receives p2pChunkSize when non-null.
//
// Always returns ncclSuccess.
static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) {
  // Use different pools for different channels and also separate send/recv.
  const int poolIndex = (channel*NCCL_SHARED_STEPS) + slot;
  *offset = proxyState->p2pChunkSize * poolIndex;
  if (size != nullptr) {
    *size = proxyState->p2pChunkSize;
  }
  return ncclSuccess;
}
|
||||
|
||||
@@ -752,8 +758,9 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
|
||||
|
||||
// Don't give credits yet in shared mode.
|
||||
resources->sendMem->head = map->shared ? -NCCL_STEPS : 0;
|
||||
for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->sizesFifo[i] = -1;
|
||||
(resources->gdcSync ? *resources->gdcSync : resources->sendMem->head) =
|
||||
(map->shared ? -NCCL_STEPS : 0);
|
||||
for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->connFifo[i].size = -1;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
|
||||
@@ -1014,6 +1021,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
}
|
||||
|
||||
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
|
||||
#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
|
||||
|
||||
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
@@ -1022,8 +1030,15 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
|
||||
// Round to next multiple of chunkSteps
|
||||
sub->base = ROUNDUP(resources->step, args->chunkSteps);
|
||||
// Set step base for next op
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
sub->posted = sub->transmitted = sub->done = 0;
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
} else {
|
||||
sub->mhandle = resources->mhandles[args->protocol];
|
||||
}
|
||||
}
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
@@ -1035,23 +1050,24 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
if (sub->done == sub->nsteps) continue;
|
||||
struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
|
||||
void* mhandle = resources->mhandles[p];
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSize = stepSize*args->sliceSteps;
|
||||
if (sub->nbytes < buffSize) buffSize = sub->nbytes;
|
||||
// Post buffers to the GPU
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
if (resources->shared) {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
|
||||
resources->recvMem->offsFifo[buffSlot] = offset;
|
||||
__sync_synchronize();
|
||||
if (!sub->reg) {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset, NULL));
|
||||
resources->recvMem->connFifo[buffSlot].offset = offset;
|
||||
__sync_synchronize();
|
||||
}
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
sub->posted += args->sliceSteps;
|
||||
*sendHead = sub->base + sub->posted - NCCL_STEPS;
|
||||
// Only post one credit for registered buffer
|
||||
if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
} else sub->posted += args->sliceSteps;
|
||||
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
|
||||
@@ -1063,13 +1079,13 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// Check whether we received data from the GPU and send it to the network
|
||||
if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
|
||||
uint64_t tail = sub->base + (sub->reg ? 0 : sub->transmitted);
|
||||
if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) {
|
||||
// We have something to receive, let's check if it's completely ready.
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size;
|
||||
bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared;
|
||||
char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize;
|
||||
int ready = 1;
|
||||
if (p == NCCL_PROTO_LL128) {
|
||||
ready = resources->useGdr;
|
||||
@@ -1077,7 +1093,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
|
||||
// called threadfence()
|
||||
uint64_t flag = sub->base+sub->transmitted+1;
|
||||
int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
|
||||
int nFifoLines = DIVUP(connFifo[buffSlot].size, sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
|
||||
volatile uint64_t* lines = (volatile uint64_t*)buff;
|
||||
ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
@@ -1093,15 +1109,14 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
volatile uint32_t *f2 = &lines[i].flag2;
|
||||
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
|
||||
}
|
||||
} else if (p == NCCL_PROTO_SIMPLE && resources->shared) {
|
||||
buff = sub->reg ? (char*)sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset;
|
||||
}
|
||||
if (ready) {
|
||||
// Data is ready, try to send.
|
||||
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
||||
sizesFifo[buffSlot] = -1;
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId);
|
||||
sub->transmitted += args->sliceSteps;
|
||||
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
|
||||
args->idle = 0;
|
||||
@@ -1113,21 +1128,43 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// Check whether the network has completed some send operations.
|
||||
if (sub->done < sub->transmitted) {
|
||||
int done;
|
||||
int size;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL));
|
||||
NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size));
|
||||
if (done) {
|
||||
if (sub->reg) {
|
||||
if (size < sub->nbytes) {
|
||||
sub->buffer = ((char*)sub->buffer)+size;
|
||||
sub->nbytes -= size;
|
||||
// Do one more step (at least)
|
||||
sub->nsteps++;
|
||||
} else {
|
||||
// Signal the GPU the send is complete and it can return.
|
||||
connFifo[sub->base%NCCL_STEPS].size = -1;
|
||||
}
|
||||
}
|
||||
// Make sure size is reset to -1 before we update the head.
|
||||
if (sub->reg == 0) connFifo[buffSlot].size = -1;
|
||||
__sync_synchronize();
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
|
||||
|
||||
if (resources->shared == 0) {
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
*sendHead = sub->base + sub->done;
|
||||
if (sub->reg) {
|
||||
// We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
|
||||
if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps;
|
||||
} else {
|
||||
*sendHead = sub->base + sub->done;
|
||||
}
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
}
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps) {
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle));
|
||||
}
|
||||
args->done++;
|
||||
}
|
||||
}
|
||||
@@ -1171,9 +1208,17 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
recvComm = resources->netRecvComm;
|
||||
// Round to next multiple of chunkSteps
|
||||
sub->base = ROUNDUP(resources->step, args->chunkSteps);
|
||||
// Set step base for next op
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
sub->posted = sub->received = sub->transmitted = sub->done = 0;
|
||||
for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
// Register buffer
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
} else {
|
||||
sub->mhandle = resources->mhandles[args->protocol];
|
||||
}
|
||||
}
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
@@ -1188,29 +1233,37 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
int sizes[NCCL_PROXY_MAX_SUBS];
|
||||
int tags[NCCL_PROXY_MAX_SUBS];
|
||||
void* mhandles[NCCL_PROXY_MAX_SUBS];
|
||||
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
if (sub->posted < sub->nsteps) {
|
||||
if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
if (sub->reg) maxDepth = 1;
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
if (p == NCCL_PROTO_SIMPLE && resources->shared) {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
|
||||
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
|
||||
offsFifo[buffSlot] = offset;
|
||||
ptrs[subCount] = localBuff+offset;
|
||||
if (sub->reg) {
|
||||
// Wait until CUDA kernel has started before we access the user buffer directly.
|
||||
if (connFifo[sub->base%NCCL_STEPS].size == -1) continue;
|
||||
ptrs[subCount] = sub->buffer;
|
||||
sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes);
|
||||
} else {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount));
|
||||
connFifo[buffSlot].offset = offset;
|
||||
ptrs[subCount] = localBuff+offset;
|
||||
}
|
||||
} else {
|
||||
ptrs[subCount] = localBuff+buffSlot*stepSize;
|
||||
sizes[subCount] = stepSize*args->sliceSteps;
|
||||
}
|
||||
sizes[subCount] = stepSize*args->sliceSteps;
|
||||
if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
|
||||
tags[subCount] = resources->tpRemoteRank;
|
||||
mhandles[subCount] = resources->mhandles[p];
|
||||
mhandles[subCount] = sub->mhandle;
|
||||
subCount++;
|
||||
}
|
||||
}
|
||||
@@ -1246,9 +1299,27 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (done) {
|
||||
int needFlush = 0;
|
||||
int totalSize = 0;
|
||||
int subIndex = 0;
|
||||
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
if (sub->received < sub->nsteps) {
|
||||
int size = sizes[subIndex++];
|
||||
if (sub->reg) {
|
||||
if (size < sub->nbytes) {
|
||||
sub->buffer = ((char*)sub->buffer) + size;
|
||||
sub->nbytes -= size;
|
||||
// Do one more step (at least)
|
||||
sub->nsteps++;
|
||||
} else {
|
||||
// Reset connFifo size indicating the GPU was ready to receive.
|
||||
// There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU.
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
connFifo[sub->base%NCCL_STEPS].size = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
sub->received += args->sliceSteps;
|
||||
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
|
||||
if (step < sub->nsteps) {
|
||||
@@ -1276,9 +1347,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
mhandles[subCount] = resources->mhandles[p];
|
||||
int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS;
|
||||
ptrs[subCount] = resources->shared ?
|
||||
(sub->reg ? sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
|
||||
localBuff+buffSlot*stepSize;
|
||||
mhandles[subCount] = sub->mhandle;
|
||||
subCount++;
|
||||
}
|
||||
}
|
||||
@@ -1302,13 +1375,18 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (done) {
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
|
||||
sub->transmitted += args->sliceSteps;
|
||||
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
|
||||
if (step < sub->nsteps) {
|
||||
__sync_synchronize();
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
|
||||
*recvTail = sub->base + sub->transmitted;
|
||||
if (sub->reg) {
|
||||
// We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
|
||||
if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps;
|
||||
} else
|
||||
*recvTail = sub->base + sub->transmitted;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
}
|
||||
}
|
||||
@@ -1326,7 +1404,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (sub->transmitted > sub->done) {
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
uint64_t done = *sendHead;
|
||||
uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead;
|
||||
while (done > sub->base + sub->done &&
|
||||
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
|
||||
sub->transmitted > sub->done) {
|
||||
@@ -1341,7 +1419,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps) {
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle));
|
||||
}
|
||||
args->done++;
|
||||
break;
|
||||
}
|
||||
|
||||
+809
-420
Failā izmaiņas netiks attēlotas, jo tās ir par lielu
Ielādēt izmaiņas
@@ -96,6 +96,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
props->pciPath = ncclNetSocketDevs[dev].pciPath;
|
||||
props->guid = dev;
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
props->regIsGlobal = 0;
|
||||
NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
|
||||
props->latency = 0; // Not set
|
||||
props->port = 0;
|
||||
@@ -534,7 +535,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
||||
ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
|
||||
return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
|
||||
}
|
||||
ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
|
||||
|
||||
+107
-119
@@ -11,6 +11,7 @@
|
||||
#include "utils.h"
|
||||
#include "proxy.h"
|
||||
#include "enqueue.h"
|
||||
#include "register.h"
|
||||
|
||||
#if CUDART_VERSION >= 12010
|
||||
|
||||
@@ -20,19 +21,8 @@ struct graphRegData {
|
||||
};
|
||||
|
||||
struct localRegData {
|
||||
/* Registration record data */
|
||||
uintptr_t recSendbuff, recRecvbuff;
|
||||
intptr_t recSendOffset, recRecvOffset;
|
||||
/* Registration request data */
|
||||
uintptr_t reqSendbuff, reqRecvbuff;
|
||||
size_t reqSendSize, reqRecvSize;
|
||||
intptr_t reqSendOffset, reqRecvOffset;
|
||||
};
|
||||
|
||||
struct localRequestData {
|
||||
uintptr_t reqBuff;
|
||||
size_t reqSize;
|
||||
intptr_t reqOffset;
|
||||
struct ncclReg reg;
|
||||
intptr_t offset;
|
||||
};
|
||||
|
||||
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
@@ -116,11 +106,9 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int
|
||||
// cuMem UDS support
|
||||
int fd = -1;
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
|
||||
struct ncclProxyConnector proxyConn;
|
||||
int tpProxyRank = comm->topParentRanks[rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn));
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, shareableHandle, &fd));
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd));
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
|
||||
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
|
||||
(void) close(fd);
|
||||
@@ -248,7 +236,8 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
|
||||
|
||||
int gpuCount;
|
||||
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
|
||||
if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
|
||||
// NVLS is not supported on MNNVL yet
|
||||
if (!ncclParamNvlsEnable() || gpuCount <= 2 || comm->nNodes > 1 || comm->MNNVL) return ncclSuccess;
|
||||
|
||||
CUdevice dev;
|
||||
int driverVersion;
|
||||
@@ -292,14 +281,14 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
if (nvlsShare) {
|
||||
/* reuse NVLS resources */
|
||||
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
|
||||
for (int c = 0; c < comm->nvlsChannels; c++) {
|
||||
for (int c = 0; c < comm->nChannels; c++) {
|
||||
NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup);
|
||||
}
|
||||
|
||||
comm->nvlsResources = parent->nvlsResources;
|
||||
ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount);
|
||||
} else {
|
||||
int nChannels;
|
||||
int nChannels = comm->nChannels;
|
||||
struct ncclNvlsSharedRes* resources;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
@@ -312,7 +301,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
|
||||
}
|
||||
|
||||
nChannels = resources->nChannels = comm->nvlsChannels;
|
||||
resources->nChannels = comm->nvlsChannels;
|
||||
for (int c = 0; c < nChannels; c++) {
|
||||
NCCLCHECK(initNvlsChannel(comm, c, parent, false));
|
||||
}
|
||||
@@ -390,7 +379,8 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
}
|
||||
|
||||
/* create shared memory for fast NVLS buffer registration */
|
||||
typeSize = sizeof(struct localRegData);
|
||||
typeSize = sizeof(struct localRegData) << 1;
|
||||
|
||||
if (comm->localRank == 0) {
|
||||
shmPath[0] = '\0';
|
||||
NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, cleanup);
|
||||
@@ -405,6 +395,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks);
|
||||
comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t));
|
||||
comm->nvlsResources->nvlsShmem.round = 0;
|
||||
comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize;
|
||||
|
||||
return res;
|
||||
|
||||
@@ -427,23 +418,59 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *reqData, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
|
||||
ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclRegRecord *regRecord = NULL;
|
||||
struct localRequestData *myReqData = &reqData[comm->localRank];
|
||||
struct ncclReg *regRecord = NULL;
|
||||
CUdeviceptr regPtr = 0;
|
||||
CUmulticastObjectProp prop;
|
||||
char shareableHandle[NVLS_HANDLE_SIZE];
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
size_t granularity;
|
||||
size_t minSize;
|
||||
size_t minSize = SIZE_MAX;
|
||||
bool localRegBufUsed = false;
|
||||
struct localRegData* regData = NULL;
|
||||
cudaPointerAttributes attr;
|
||||
|
||||
/* get minimal size of nvls buffers */
|
||||
minSize = reqData[0].reqSize;
|
||||
for (int i = 1; i < comm->localRanks; ++i) {
|
||||
if (minSize > reqData[i].reqSize)
|
||||
minSize = reqData[i].reqSize;
|
||||
NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
|
||||
|
||||
if (userBuff) {
|
||||
NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
|
||||
if (regRecord) {
|
||||
CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
|
||||
if (attr.type == cudaMemoryTypeDevice) {
|
||||
size_t regSize = regRecord->pages * comm->regCache.pageSize;
|
||||
prop = comm->nvlsResources->properties;
|
||||
prop.size = regSize;
|
||||
CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
|
||||
CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
|
||||
if (regSize % granularity == 0) {
|
||||
regRecord->regSize = regSize;
|
||||
} else {
|
||||
regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr);
|
||||
}
|
||||
|
||||
if (regRecord->addr % comm->nvlsResources->ucGran == 0 && regRecord->regSize % granularity == 0) {
|
||||
regRecord->state |= NVLS_REG_POSSIBLE;
|
||||
memcpy(&regData[comm->localRank].reg, regRecord, sizeof(struct ncclReg));
|
||||
regData[comm->localRank].offset = userBuff - regRecord->addr;
|
||||
}
|
||||
}
|
||||
|
||||
if ((regRecord->state & NVLS_REG_POSSIBLE) == 0) {
|
||||
regRecord->state |= NVLS_REG_NO_SUPPORT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail);
|
||||
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if ((regData[i].reg.state & NVLS_REG_POSSIBLE) == 0) {
|
||||
goto fail;
|
||||
}
|
||||
/* get minimal reg size of nvls buffers */
|
||||
if (minSize > regData[i].reg.regSize)
|
||||
minSize = regData[i].reg.regSize;
|
||||
}
|
||||
|
||||
/* start registration */
|
||||
@@ -459,7 +486,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *r
|
||||
}
|
||||
|
||||
CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
|
||||
CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)myReqData->reqBuff, minSize, 0), ret, fail);
|
||||
CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
|
||||
|
||||
// Create a VA for the NVLS
|
||||
CUCHECKGOTO(cuMemAddressReserve(&regPtr, minSize, granularity, 0U, 0), ret, fail);
|
||||
@@ -467,26 +494,28 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *r
|
||||
CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail);
|
||||
CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&regRecord, 1), ret, fail);
|
||||
regRecord->buff = myReqData->reqBuff;
|
||||
regRecord->size = myReqData->reqSize;
|
||||
regRecord->regAddr = regPtr;
|
||||
regRecord->regSize = minSize;
|
||||
regRecord->dev = comm->nvlsResources->dev;
|
||||
regRecord->mcHandle = mcHandle;
|
||||
regRecord->state |= NVLS_REG_COMPLETE;
|
||||
/* get all buffer addresses */
|
||||
NCCLCHECKGOTO(ncclCalloc(&regRecord->addrs, comm->localRanks), ret, fail);
|
||||
regRecord->addrs[comm->localRank] = regRecord->buff;
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->addrs + comm->localRank, regRecord->addrs, sizeof(uintptr_t)), ret, fail);
|
||||
/* enqueue record */
|
||||
ncclIntruQueueEnqueue(&comm->regRecordQueue, regRecord);
|
||||
regRecord->caddrs[comm->localRank] = regRecord->addr;
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail);
|
||||
|
||||
/* Although registration is done, we still need to check whether the offsets are same among ranks. */
|
||||
for (int i = 0; i < comm->localRanks - 1; ++i) {
|
||||
if (regData[i].offset != regData[i + 1].offset) {
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
localRegBufUsed = true;
|
||||
|
||||
exit:
|
||||
if (localRegBufUsed)
|
||||
*regAddr = (uintptr_t)regPtr + userBuff - myReqData->reqBuff;
|
||||
if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
|
||||
*regUsed = localRegBufUsed;
|
||||
free(regData);
|
||||
return ret;
|
||||
fail:
|
||||
localRegBufUsed = false;
|
||||
@@ -497,77 +526,52 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
bool localRegBufUsed = false;
|
||||
struct localRegData *regData = NULL;
|
||||
struct localRequestData *reqData = NULL;
|
||||
struct ncclRegRecord *regRecordHead = NULL, *sendRegRecord = NULL, *recvRegRecord = NULL;
|
||||
struct ncclRegRequest *regRequestHead = NULL, *sendRegRequest = NULL, *recvRegRequest = NULL;
|
||||
bool sendNeedReg = false, recvNeedReg = false;
|
||||
CUdeviceptr regSendPtr = 0;
|
||||
CUdeviceptr regRecvPtr = 0;
|
||||
struct ncclReg *sendRegRecord = NULL;
|
||||
struct ncclReg *recvRegRecord = NULL;
|
||||
|
||||
*outRegBufUsed = false;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks * 2), ret, fail);
|
||||
|
||||
/* first check whether the buffer has been registered and matches each other globally */
|
||||
regRecordHead = ncclIntruQueueHead(&comm->regRecordQueue);
|
||||
while (regRecordHead && ((sendRegRecord == NULL && sendbuff != NULL) || (recvRegRecord == NULL && recvbuff != NULL))) {
|
||||
/* check send reg record */
|
||||
if (sendRegRecord == NULL && regRecordHead->buff <= (uintptr_t)sendbuff &&
|
||||
regRecordHead->buff + regRecordHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
|
||||
regData[comm->localRank].recSendbuff = regRecordHead->buff;
|
||||
regData[comm->localRank].recSendOffset = (uintptr_t)sendbuff - regRecordHead->buff;
|
||||
sendRegRecord = regRecordHead;
|
||||
if (sendbuff) {
|
||||
NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail);
|
||||
if (sendRegRecord) {
|
||||
memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
|
||||
regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr;
|
||||
}
|
||||
|
||||
/* check recv reg record */
|
||||
if (recvRegRecord == NULL && regRecordHead->buff <= (uintptr_t)recvbuff &&
|
||||
regRecordHead->buff + regRecordHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
|
||||
regData[comm->localRank].recRecvbuff = regRecordHead->buff;
|
||||
regData[comm->localRank].recRecvOffset = (uintptr_t)recvbuff - regRecordHead->buff;
|
||||
recvRegRecord = regRecordHead;
|
||||
}
|
||||
regRecordHead = regRecordHead->next;
|
||||
}
|
||||
|
||||
/* prepare registration request for later reference */
|
||||
regRequestHead = ncclIntruQueueHead(&comm->regRequestQueue);
|
||||
while (regRequestHead && ((sendRegRequest == NULL && sendbuff != NULL) || (recvRegRequest == NULL && recvbuff != NULL))) {
|
||||
/* check send reg request */
|
||||
if (regRequestHead->buff <= (uintptr_t)sendbuff &&
|
||||
regRequestHead->buff + regRequestHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
|
||||
regData[comm->localRank].reqSendbuff = regRequestHead->buff;
|
||||
regData[comm->localRank].reqSendSize = regRequestHead->size;
|
||||
regData[comm->localRank].reqSendOffset = (uintptr_t)sendbuff - regRequestHead->buff;
|
||||
sendRegRequest = regRequestHead;
|
||||
if (recvbuff) {
|
||||
NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail);
|
||||
if (recvRegRecord) {
|
||||
memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
|
||||
regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr;
|
||||
}
|
||||
|
||||
/* check recv reg request */
|
||||
if (regRequestHead->buff <= (uintptr_t)recvbuff &&
|
||||
regRequestHead->buff + regRequestHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
|
||||
regData[comm->localRank].reqRecvbuff = regRequestHead->buff;
|
||||
regData[comm->localRank].reqRecvSize = regRequestHead->size;
|
||||
regData[comm->localRank].reqRecvOffset = (uintptr_t)recvbuff - regRequestHead->buff;
|
||||
recvRegRequest = regRequestHead;
|
||||
}
|
||||
regRequestHead = regRequestHead->next;
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail);
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail);
|
||||
|
||||
/* first check whether all local ranks find their registered buffer */
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if (regData[i].recSendbuff == 0 || sendRegRecord->addrs[i] != regData[i].recSendbuff) {
|
||||
if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.addr) {
|
||||
sendNeedReg = true;
|
||||
}
|
||||
|
||||
if (regData[i].recRecvbuff == 0 || recvRegRecord->addrs[i] != regData[i].recRecvbuff) {
|
||||
if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.addr) {
|
||||
recvNeedReg = true;
|
||||
}
|
||||
|
||||
if ((regData[i * 2].reg.state & NVLS_REG_NO_SUPPORT) || (regData[i * 2 + 1].reg.state & NVLS_REG_NO_SUPPORT)) {
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
if (sendNeedReg == false) {
|
||||
for (int i = 0; i < comm->localRanks - 1; ++i) {
|
||||
if (regData[i].recSendOffset != regData[i + 1].recSendOffset) {
|
||||
if (regData[i * 2].offset != regData[(i + 1) * 2].offset) {
|
||||
/* offset are different, we cannot apply user buffer registration */
|
||||
goto fail;
|
||||
}
|
||||
@@ -575,18 +579,18 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
|
||||
/* reuse previous registered buffer if possible */
|
||||
if (!sendNeedReg)
|
||||
regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank].recSendOffset);
|
||||
regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank * 2].offset);
|
||||
}
|
||||
|
||||
if (recvNeedReg == false) {
|
||||
for (int i = 0; i < comm->localRanks - 1; ++i) {
|
||||
if (regData[i].recRecvOffset != regData[i + 1].recRecvOffset) {
|
||||
if (regData[i * 2 + 1].offset != regData[(i + 1) * 2 + 1].offset) {
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
if (!recvNeedReg)
|
||||
regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank].recRecvOffset);
|
||||
regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank * 2 + 1].offset);
|
||||
}
|
||||
|
||||
if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
|
||||
@@ -597,29 +601,13 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
|
||||
/* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate
|
||||
* in register request cache. */
|
||||
NCCLCHECKGOTO(ncclCalloc(&reqData, comm->localRanks), ret, fail);
|
||||
if (sendNeedReg && sendbuff != NULL) {
|
||||
/* copy request data got from previous shmem AG */
|
||||
intptr_t offset = regData[0].reqSendOffset;
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if (regData[i].reqSendbuff == 0 || offset != regData[i].reqSendOffset) goto fail;
|
||||
reqData[i].reqBuff = regData[i].reqSendbuff;
|
||||
reqData[i].reqSize = regData[i].reqSendSize;
|
||||
reqData[i].reqOffset = regData[i].reqSendOffset;
|
||||
}
|
||||
tryRegisterBuffer(comm, reqData, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
|
||||
if (sendNeedReg && sendbuff) {
|
||||
tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
|
||||
if (localRegBufUsed == false) goto fail;
|
||||
}
|
||||
|
||||
if (recvNeedReg && recvbuff != NULL) {
|
||||
intptr_t offset = regData[0].reqRecvOffset;
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if (regData[i].reqRecvbuff == 0 || offset != regData[i].reqRecvOffset) goto fail;
|
||||
reqData[i].reqBuff = regData[i].reqRecvbuff;
|
||||
reqData[i].reqSize = regData[i].reqRecvSize;
|
||||
reqData[i].reqOffset = regData[i].reqRecvOffset;
|
||||
}
|
||||
tryRegisterBuffer(comm, reqData, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
|
||||
if (recvNeedReg && recvbuff) {
|
||||
tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
|
||||
if (localRegBufUsed == false) goto fail;
|
||||
}
|
||||
|
||||
@@ -630,7 +618,6 @@ exit:
|
||||
*outRegBufRecv = (void*)regRecvPtr;
|
||||
*outRegBufUsed = localRegBufUsed;
|
||||
free(regData);
|
||||
free(reqData);
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
localRegBufUsed = false;
|
||||
@@ -647,7 +634,7 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
|
||||
CUmulticastObjectProp prop;
|
||||
char shareableHandle[NVLS_HANDLE_SIZE];
|
||||
CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
|
||||
size_t sendGran, recvGran;
|
||||
size_t sendGran = 0, recvGran = 0;
|
||||
bool *regBufFlags = NULL;
|
||||
struct graphRegData *rdata = NULL;
|
||||
const void *baseSend = NULL;
|
||||
@@ -667,19 +654,17 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
|
||||
if (recvbuff != NULL)
|
||||
CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);
|
||||
|
||||
memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
|
||||
prop.size = baseSendSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
prop.size = baseRecvSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
localRegBufUsed = ((uint64_t)baseSend % sendGran != 0 || (uint64_t)baseRecv % recvGran != 0) ? false : true;
|
||||
localRegBufUsed = ((uint64_t)baseSend % comm->nvlsResources->ucGran != 0 || (uint64_t)baseRecv % comm->nvlsResources->ucGran != 0) ? false : true;
|
||||
regBufFlags[comm->localRank] = localRegBufUsed;
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
|
||||
for (int i = 0; i < comm->localRanks; ++i)
|
||||
if (regBufFlags[i] == false) goto fail;
|
||||
|
||||
memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
|
||||
if (sendbuff != NULL) {
|
||||
prop.size = baseSendSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
/* check send buffer offset and size */
|
||||
rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
|
||||
rdata[comm->localRank].size = baseSendSize;
|
||||
@@ -719,6 +704,9 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
|
||||
}
|
||||
|
||||
if (recvbuff != NULL) {
|
||||
prop.size = baseRecvSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
|
||||
rdata[comm->localRank].size = baseRecvSize;
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
|
||||
|
||||
+24
-22
@@ -103,6 +103,12 @@ static void initCeOperation();
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
initCeOperation();
|
||||
|
||||
// MNNVL support
|
||||
if (info1->hostHash != info2->hostHash) {
|
||||
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
|
||||
if (*ret) return ncclSuccess;
|
||||
}
|
||||
|
||||
// Rule out different nodes / isolated containers
|
||||
if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) {
|
||||
*ret = 0;
|
||||
@@ -190,8 +196,9 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
|
||||
if (ncclCuMemEnable()) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
|
||||
// cuMem API support
|
||||
CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
|
||||
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
|
||||
@@ -227,18 +234,16 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
|
||||
#if CUDART_VERSION >= 11030
|
||||
// cuMem API support
|
||||
CUdeviceptr dptr = 0;
|
||||
CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
|
||||
|
||||
// Import and map the remote memory descriptor to the local GPU
|
||||
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
|
||||
// UDS fd support
|
||||
struct ncclProxyConnector proxyConn;
|
||||
int fd = -1;
|
||||
// Send cuMem handle to remote for conversion to an fd
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn));
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, &cuDesc->data, &fd));
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd));
|
||||
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
|
||||
CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
|
||||
(void) close(fd);
|
||||
@@ -276,6 +281,8 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
|
||||
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
|
||||
NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);
|
||||
|
||||
#define P2P_SAME_PID(MYINFO, PEERINFO) ((MYINFO->hostHash == PEERINFO->hostHash) && (MYINFO->pidHash == PEERINFO->pidHash))
|
||||
|
||||
static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
|
||||
int p2p;
|
||||
// Queries the topology to see if the GPUs are Ampere and
|
||||
@@ -288,7 +295,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
|
||||
}
|
||||
|
||||
static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (P2P_SAME_PID(myInfo, peerInfo)) {
|
||||
if (peerInfo->cudaDev != myInfo->cudaDev) {
|
||||
// Same PID different GPUs, enable P2P access
|
||||
// Legacy CUDA IPC
|
||||
@@ -316,15 +323,9 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
|
||||
*devMem = p2pBuff->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
} else {
|
||||
if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) {
|
||||
// Same PID and GPU
|
||||
*devMem = p2pBuff->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
} else {
|
||||
// Different PID or different GPU
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
|
||||
*ipcPtr = *devMem;
|
||||
}
|
||||
// Different PID
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
|
||||
*ipcPtr = *devMem;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -354,7 +355,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
resources->type = P2P_DIRECT;
|
||||
send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
|
||||
@@ -363,8 +364,9 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
// cuMem API support
|
||||
if (ncclCuMemEnable()) {
|
||||
resources->type = P2P_CUMEM;
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/CUMEM%s%s",
|
||||
channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : "");;
|
||||
const char *MNNVL = comm->MNNVL ? "MNNVL" : "CUMEM";
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/%s%s%s",
|
||||
channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, MNNVL, useReadStr, useMemcpy ? "/CE" : "");;
|
||||
} else {
|
||||
// Legacy CUDA IPC
|
||||
resources->type = P2P_IPC;
|
||||
@@ -418,7 +420,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
resources->type = P2P_DIRECT;
|
||||
recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
} else {
|
||||
@@ -468,7 +470,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
|
||||
if (useMemcpy) {
|
||||
send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
|
||||
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
|
||||
send->conn.connFifo = resources->proxyInfo.ceRecvMem->connFifo;
|
||||
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
|
||||
// Send SIMPLE buff to proxy, and replace it by local buffer
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
|
||||
@@ -712,11 +714,11 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
}
|
||||
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
|
||||
// Check GPU has sent everything
|
||||
if ((*recvTail > sub->base+sub->transmitted)) {
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = connFifo[buffSlot].size;
|
||||
CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream));
|
||||
CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
|
||||
sub->transmitted += args->sliceSteps;
|
||||
|
||||
@@ -152,7 +152,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
send->conn.head = &resources->devHostMem->head;
|
||||
|
||||
if (useMemcpyRecv) {
|
||||
send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
|
||||
send->conn.connFifo = resources->devRemHostMem->connFifo;
|
||||
}
|
||||
if (useMemcpySend) {
|
||||
int tpProxyRank;
|
||||
@@ -162,7 +162,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
|
||||
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
|
||||
send->conn.tail = &proxyInfo.ceRecvMem->tail;
|
||||
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
|
||||
send->conn.connFifo = proxyInfo.ceRecvMem->connFifo;
|
||||
}
|
||||
|
||||
// We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time
|
||||
@@ -315,15 +315,15 @@ static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
}
|
||||
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
|
||||
// Check GPU has sent everything
|
||||
if ((*recvTail > sub->base+sub->transmitted)) {
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = connFifo[buffSlot].size;
|
||||
CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream));
|
||||
CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
|
||||
resources->recvMem->sizesFifo[buffSlot] = size;
|
||||
__sync_synchronize(); // make sure sizesFifo is visible
|
||||
resources->recvMem->connFifo[buffSlot].size = size;
|
||||
__sync_synchronize(); // make sure connFifo[].size is visible
|
||||
sub->transmitted += args->sliceSteps;
|
||||
}
|
||||
}
|
||||
@@ -374,11 +374,11 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
}
|
||||
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = resources->recvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
// Check data is ready in SHM
|
||||
if ((*recvTail > sub->base+sub->transmitted)) {
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = connFifo[buffSlot].size;
|
||||
CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream));
|
||||
CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
|
||||
sub->transmitted += args->sliceSteps;
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user