Merge remote-tracking branch 'nccl/master' into develop
[ROCm/rccl commit: e1a835910e]
Этот коммит содержится в:
@@ -278,11 +278,12 @@ set(SRC_FILES
|
||||
src/enqueue.cc
|
||||
src/group.cc
|
||||
src/init.cc
|
||||
src/init_nvtx.cc
|
||||
src/net.cc
|
||||
src/msccl.cc
|
||||
src/proxy.cc
|
||||
src/register.cc
|
||||
src/transport.cc
|
||||
src/init_nvtx.cc
|
||||
# src/clique/AllReduceCliqueKernel.h
|
||||
# src/clique/CliqueCommon.h
|
||||
# src/clique/CliqueManager.cc
|
||||
@@ -370,6 +371,7 @@ set(SRC_FILES
|
||||
src/include/profiler.h
|
||||
src/include/proxy.h
|
||||
src/include/rccl_vars.h
|
||||
src/include/register.h
|
||||
src/include/rccl_float8.h
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/include/rocmwrap.h
|
||||
|
||||
@@ -17,13 +17,14 @@
|
||||
#define NCCL_PTR_DMABUF 0x4
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 8
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#include "net_v8.h"
|
||||
#include "net_v7.h"
|
||||
#include "net_v6.h"
|
||||
#include "net_v5.h"
|
||||
|
||||
@@ -26,6 +26,7 @@ typedef struct {
|
||||
int needsProxyProgress;
|
||||
} ncclNetDeviceHandle_v7_t;
|
||||
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
#ifndef NCCL_NET_V6_H_
|
||||
#define NCCL_NET_V6_H_
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V6 8
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
|
||||
@@ -22,8 +22,6 @@ typedef struct {
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v7_t;
|
||||
|
||||
typedef ncclNetProperties_v7_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V8_H_
|
||||
#define NCCL_NET_V8_H_
|
||||
|
||||
#include "net_device.h"
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v8_t;
|
||||
|
||||
typedef ncclNetProperties_v8_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v8_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -15,15 +15,37 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;
|
||||
|
||||
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
//pluginPciPath(dev, &props.pciPath);
|
||||
//pluginPtrSupport(dev, &props.ptrSupport);
|
||||
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
// Below are default values, if unsure don't change.
|
||||
|
||||
props->name = "Example";
|
||||
// Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
|
||||
props->pciPath = NULL;
|
||||
// Only used to detect NICs with multiple PCI attachments.
|
||||
props->guid = 0;
|
||||
// Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
// If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
|
||||
props->regIsGlobal = 0;
|
||||
// Speed in *Mbps*. 100000 means 100G
|
||||
props->speed = 100000;
|
||||
// Port number, used in conjunction with guid
|
||||
props->port = 0;
|
||||
// Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
|
||||
props->latency = 0;
|
||||
// Maximum number of comm objects we can create.
|
||||
props->maxComms = 1024*1024;
|
||||
// Maximum number of receive operations taken by irecv().
|
||||
props->maxRecvs = 1;
|
||||
// Coupling with NCCL network device-side code.
|
||||
props->netDeviceType = 0;
|
||||
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
|
||||
return ncclInternalError;
|
||||
}
|
||||
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
|
||||
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; }
|
||||
@@ -38,7 +60,7 @@ __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_m
|
||||
|
||||
#define PLUGIN_NAME "Plugin"
|
||||
|
||||
const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
const ncclNet_v8_t ncclNetPlugin_v8 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
@@ -60,10 +82,62 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
};
|
||||
|
||||
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
|
||||
//pluginPciPath(dev, &props.pciPath);
|
||||
//pluginPtrSupport(dev, &props.ptrSupport);
|
||||
return ncclInternalError;
|
||||
/*
 * v7 compatibility wrapper for getProperties.
 *
 * Calls the current pluginGetProperties() into a local ncclNetProperties_t
 * and copies the result, field by field, into the caller's v7 structure.
 *
 *   dev       device index, forwarded unchanged to pluginGetProperties().
 *   props_v7  output structure; written only when pluginGetProperties()
 *             succeeds.
 *
 * Returns the error code from pluginGetProperties() if it fails, and
 * ncclSuccess otherwise.
 */
__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
  ncclNetProperties_t props;
  ncclResult_t ret = pluginGetProperties(dev, &props);
  if (ret != ncclSuccess) return ret;
  props_v7->name = props.name;
  props_v7->pciPath = props.pciPath;
  props_v7->guid = props.guid;
  props_v7->ptrSupport = props.ptrSupport;
  props_v7->speed = props.speed;
  props_v7->port = props.port;
  // Forward latency as well; without this the caller's latency field is
  // never written by this wrapper.
  // NOTE(review): relies on ncclNetProperties_v7_t declaring `latency` --
  // confirm against net_v7.h.
  props_v7->latency = props.latency;
  props_v7->maxComms = props.maxComms;
  props_v7->maxRecvs = props.maxRecvs;
  props_v7->netDeviceType = props.netDeviceType;
  props_v7->netDeviceVersion = props.netDeviceVersion;
  // regIsGlobal is not forwarded -- presumably it has no v7 counterpart;
  // confirm against net_v7.h.
  return ncclSuccess;
}
|
||||
|
||||
/*
 * regMr wrapper with an `int size` parameter: forwards all arguments to
 * pluginRegMr() and returns its result unchanged.
 */
__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
  ncclResult_t status = pluginRegMr(collComm, data, size, type, mhandle);
  return status;
}
|
||||
|
||||
const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v7,
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
.irecv = pluginIrecv,
|
||||
.iflush = pluginIflush,
|
||||
.test = pluginTest,
|
||||
.closeSend = pluginCloseSend,
|
||||
.closeRecv = pluginCloseRecv,
|
||||
.closeListen = pluginCloseListen,
|
||||
.getDeviceMr = pluginGetDeviceMr,
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
};
|
||||
|
||||
/*
 * v6 compatibility wrapper for getProperties.
 *
 * Calls the current pluginGetProperties() into a local ncclNetProperties_t
 * and copies the result, field by field, into the caller's v6 structure.
 *
 *   dev       device index, forwarded unchanged to pluginGetProperties().
 *   props_v6  output structure; written only when pluginGetProperties()
 *             succeeds.
 *
 * Returns the error code from pluginGetProperties() if it fails, and
 * ncclSuccess otherwise.
 */
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
  ncclNetProperties_t props;
  ncclResult_t ret = pluginGetProperties(dev, &props);
  if (ret != ncclSuccess) return ret;
  props_v6->name = props.name;
  props_v6->pciPath = props.pciPath;
  props_v6->guid = props.guid;
  props_v6->ptrSupport = props.ptrSupport;
  props_v6->speed = props.speed;
  props_v6->port = props.port;
  // Forward latency as well; without this the caller's latency field is
  // never written by this wrapper.
  // NOTE(review): relies on ncclNetProperties_v6_t declaring `latency` --
  // confirm against net_v6.h.
  props_v6->latency = props.latency;
  props_v6->maxComms = props.maxComms;
  props_v6->maxRecvs = props.maxRecvs;
  // regIsGlobal, netDeviceType and netDeviceVersion are not forwarded --
  // presumably they have no v6 counterpart; confirm against net_v6.h.
  return ncclSuccess;
}
|
||||
|
||||
__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
|
||||
@@ -77,7 +151,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
@@ -98,7 +172,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
.irecv = pluginIrecv,
|
||||
@@ -110,17 +184,17 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
};
|
||||
|
||||
/* v4 Compat */
|
||||
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
|
||||
ncclNetProperties_v6_t props_v6;
|
||||
ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
|
||||
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
props->name = props_v6.name;
|
||||
props->pciPath = props_v6.pciPath;
|
||||
props->guid = props_v6.guid;
|
||||
props->ptrSupport = props_v6.ptrSupport;
|
||||
props->speed = props_v6.speed;
|
||||
props->port = props_v6.port;
|
||||
props->maxComms = props_v6.maxComms;
|
||||
props_v4->name = props.name;
|
||||
props_v4->pciPath = props.pciPath;
|
||||
props_v4->guid = props.guid;
|
||||
props_v4->ptrSupport = props.ptrSupport;
|
||||
props_v4->speed = props.speed;
|
||||
props_v4->port = props.port;
|
||||
props_v4->maxComms = props.maxComms;
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
|
||||
@@ -157,7 +231,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
@@ -202,7 +276,7 @@ const ncclNet_v3_t ncclNetPlugin_v3 = {
|
||||
.listen = pluginListen_v3,
|
||||
.connect = pluginConnect_v3,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
@@ -223,7 +297,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 19
|
||||
NCCL_PATCH := 4
|
||||
NCCL_MINOR := 20
|
||||
NCCL_PATCH := 5
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
|
||||
INCEXPORTS := nccl.h nccl_net.h
|
||||
LIBSRCFILES := \
|
||||
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
|
||||
init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
|
||||
init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \
|
||||
$(wildcard graph/*.cc) \
|
||||
$(wildcard misc/*.cc) \
|
||||
$(wildcard transport/*.cc)
|
||||
|
||||
@@ -222,6 +222,7 @@ struct bootstrapState {
|
||||
struct ncclSocket ringSendSocket;
|
||||
union ncclSocketAddress* peerCommAddresses;
|
||||
union ncclSocketAddress* peerProxyAddresses;
|
||||
uint64_t* peerProxyAddressesUDS;
|
||||
struct unexConn* unexpectedConnections;
|
||||
int cudaDev;
|
||||
int rank;
|
||||
@@ -300,6 +301,7 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
|
||||
// Create the service proxy
|
||||
NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
|
||||
NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks));
|
||||
|
||||
// proxy is aborted through a message; don't set abortFlag
|
||||
NCCLCHECK(ncclCalloc(&proxySocket, 1));
|
||||
@@ -307,7 +309,13 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
NCCLCHECK(ncclSocketListen(proxySocket));
|
||||
NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
|
||||
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
|
||||
// cuMem UDS support
|
||||
// Make sure we create a unique UDS socket name
|
||||
uint64_t randId;
|
||||
NCCLCHECK(getRandomData(&randId, sizeof(randId)));
|
||||
state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)));
|
||||
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
||||
|
||||
@@ -360,8 +368,6 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
for (int i = 0; i < nranks; ++i) {
|
||||
comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
|
||||
}
|
||||
comm->proxyState = parent->sharedRes->proxyState;
|
||||
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
|
||||
} else {
|
||||
// Create the service proxy
|
||||
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
|
||||
@@ -371,10 +377,17 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
|
||||
memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
|
||||
// cuMem UDS support
|
||||
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
|
||||
// Make sure we create a unique UDS socket name
|
||||
uint64_t randId;
|
||||
NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail);
|
||||
state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
|
||||
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
|
||||
INFO(NCCL_INIT, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks, color, key, prev, next);
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
@@ -573,7 +586,7 @@ ncclResult_t bootstrapClose(void* commState) {
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
if (state->unexpectedConnections != NULL) {
|
||||
unexpectedFree(state);
|
||||
if (*state->abortFlag == 0) {
|
||||
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
|
||||
WARN("Unexpected connections are not empty");
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -597,6 +610,7 @@ ncclResult_t bootstrapAbort(void* commState) {
|
||||
NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
|
||||
free(state->peerCommAddresses);
|
||||
free(state->peerProxyAddresses);
|
||||
free(state->peerProxyAddressesUDS);
|
||||
free(state);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -191,6 +191,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
va_start(vargs, fmt);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
|
||||
// Rewind len so that we can replace the final \0 by \n
|
||||
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
|
||||
@@ -17,20 +17,21 @@ namespace {
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
#endif
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const int *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*int(chunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t count = args->count;
|
||||
size_t offset;
|
||||
size_t dataOffset;
|
||||
int nelem;
|
||||
int rankDest;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = bid;
|
||||
int npKitCtxIdx = gridOffset / channelCount;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
@@ -50,7 +51,7 @@ namespace {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
@@ -66,28 +67,14 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset,nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
|
||||
realChunkSize = int(realChunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + int(bid*realChunkSize);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
dataOffset = gridOffset + elemOffset;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ringRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
@@ -97,10 +84,10 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
if (inputBuf + chunkOffset == outputBuf + offset) { // In place
|
||||
prims.directSend(chunkOffset, offset, nelem);
|
||||
if (inputBuf + dataOffset == outputBuf + offset) { // In place
|
||||
prims.directSend(dataOffset, offset, nelem);
|
||||
} else {
|
||||
prims.directCopySend(chunkOffset, offset, nelem);
|
||||
prims.directCopySend(dataOffset, offset, nelem);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT)
|
||||
@@ -121,7 +108,7 @@ namespace {
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ringRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
}
|
||||
@@ -135,7 +122,7 @@ namespace {
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ringRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
@@ -159,7 +146,7 @@ namespace {
|
||||
}
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
@@ -192,13 +179,14 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t count = args->count;
|
||||
const ssize_t rank = ncclShmem.comm.rank;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
|
||||
const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
|
||||
@@ -212,10 +200,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.gather(offset, nvls->nHeads * size, nelem, size, -1, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndBcast) {
|
||||
// Bcast through NVLS
|
||||
@@ -223,9 +211,9 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -240,7 +228,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
/* used as sync */
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.gather(0, 0, 0, 0, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndBcast) {
|
||||
@@ -251,13 +239,158 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
/* used as sync */
|
||||
prims.recv(0, 0);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t inpOffset = gridOffset + bid * chunkSize;
|
||||
ssize_t outOffset = inpOffset + rank * size;
|
||||
int nelem = min(chunkSize, size - inpOffset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
ssize_t inpOffset = gridOffset + elemOffset;
|
||||
ssize_t outOffset = inpOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directSend(inpOffset, outOffset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// AllGather work-element runner for the CollNet-direct algorithm with the Simple protocol.
// run() partitions the block's warps into up to three groups, each executing one phase:
//   phase 1 sends local data out through direct->out,
//   phase 2 receives from direct->out, deposits into recvbuff and forwards to direct->heads+1,
//   phase 3 receives from direct->heads+1 and deposits into recvbuff.
// Phases 2 and 3 delegate the actual copying to the nested Scatterer functor.
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
// Functor passed to Primitives::process(). For one chunk window it copies data from
// srcPtrs into the user receive buffer and/or dstPtrs, splitting the window at
// per-node boundaries so each piece lands in the slot of the rank it belongs to.
// BcastSendNotRecv == true : a single pass over srcPtrs[0], using this rank's own rail.
// BcastSendNotRecv == false: one pass per source, starting at the rail after this
//                            rank's own and advancing (with wrap-around) each pass.
template<bool BcastSendNotRecv>
|
||||
struct Scatterer {
|
||||
// Work element being processed; supplies bid, count, sendbuff and recvbuff.
struct ncclWorkElem* args;
|
||||
// Size of one chunk window; set by run() from args->chunkCount.
ssize_t chunkSize;
|
||||
// Offset of the current outer-loop iteration in run(); the window for this block
// starts at railGridOffset + bid*chunkSize.
ssize_t railGridOffset;
|
||||
|
||||
// Invoked with the source/destination pointer arrays for one slice.
// tid/tn are the caller's thread index and thread count and are forwarded to reduceCopy.
// slice and maxSliceSize are not used by this body.
// dstSizes receives, for each of the nDsts destinations, the size of the window copied.
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
|
||||
) {
|
||||
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = direct->nHeads;
|
||||
int bid = args->bid;
|
||||
char* inbuf = (char*)args->sendbuff;
|
||||
char* outbuf = (char*)args->recvbuff;
|
||||
// Per-rank size computed as count * sizeof(T).
// NOTE(review): run() bounds its loops with args->count (no sizeof(T) factor); the two
// only agree when sizeof(T)==1 — confirm which unit the callers intend.
ssize_t sizePerRank = args->count*sizeof(T);
|
||||
// True when sendbuff already points at this rank's slot inside recvbuff.
bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);
|
||||
|
||||
// Window [railAllBeg, railAllEnd) handled by this block, clamped to nNodes*sizePerRank.
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
// The first nDsts threads each record the window size for one destination.
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
|
||||
// src indexes srcPtrs; rail selects the column of collNetDenseToUserRank to look up.
int src = 0;
|
||||
int rail;
|
||||
if (BcastSendNotRecv) {
|
||||
rail = direct->headRank;
|
||||
} else {
|
||||
rail = direct->headRank+1;
|
||||
if (rail == nRails) rail = 0;
|
||||
}
|
||||
// One iteration per source. Because this is a do/while, the body runs at least once
// even when nRails-1 == 0.
do {
|
||||
// Index of the node whose segment contains the start of the window.
int node = railAllBeg/sizePerRank;
|
||||
int railAllOffset = 0;
|
||||
// Walk the window, one node segment at a time.
while (railAllOffset < railAllSize) {
|
||||
ssize_t railOneBeg = node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
// Position of the current piece relative to the start of this node's segment.
ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
|
||||
// Length of the current piece: up to the end of the window or of the node segment,
// whichever comes first.
int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
|
||||
// Translate (node, rail) into a user rank through the comm's lookup table.
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
|
||||
// Offset of the piece inside recvbuff: that rank's slot plus the in-segment offset.
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
|
||||
// 0 when the piece is this rank's own data and the operation is in place (the user
// buffer is then not added as a destination); 1 otherwise.
int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
|
||||
// Copy delta from srcPtrs[src] (at the current window offset) to the user buffer,
// when outIsDst is 1, followed by each dstPtrs entry at the same window offset.
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
/*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
|
||||
/*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
|
||||
/*PreOpSrcs=*/0>
|
||||
(tid, tn, 0, nullptr, false,
|
||||
/*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
|
||||
return (char*)srcPtrs[src] + railAllOffset;
|
||||
},
|
||||
/*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
|
||||
return d < outIsDst ? outbuf + userOneBeg
|
||||
: (char*)dstPtrs[d-outIsDst] + railAllOffset;
|
||||
},
|
||||
delta);
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
// Advance to the next source and the next rail (wrapping to 0 past the last rail).
src += 1;
|
||||
rail += 1;
|
||||
if (rail == nRails) rail = 0;
|
||||
} while (!BcastSendNotRecv && src < nRails-1);
|
||||
}
|
||||
};
|
||||
|
||||
// Entry point: assigns the calling thread to one of the three phases based on its
// thread index and runs that phase's loop over the whole nNodes*sizePerRank range.
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
int tid = threadIdx.x;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int const &nNodes = ncclShmem.comm.nNodes;
|
||||
ssize_t chunkSize = int(args->chunkCount);
|
||||
// NOTE(review): Scatterer uses args->count*sizeof(T) for the same quantity — confirm units.
ssize_t const &sizePerRank = args->count;
|
||||
|
||||
// Split args->nWarps between the phases in a 1:2:2 ratio when there is more than one
// head, or 1:1:0 otherwise. Groups 2 and 3 are truncated to whole warps and group 1
// takes whatever is left over.
bool isMultiRail = (direct->nHeads > 1);
|
||||
int nWarps1 = 1;
|
||||
int nWarps2 = (isMultiRail ? 2 : 1);
|
||||
int nWarps3 = (isMultiRail ? 2 : 0);
|
||||
float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
|
||||
nWarps3 = int(denom*nWarps3);
|
||||
nWarps2 = int(denom*nWarps2);
|
||||
nWarps1 = args->nWarps - (nWarps2+nWarps3);
|
||||
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
|
||||
// tn is the thread count of the group currently being tested; tid is rebased after
// each group so that it is relative to the start of the next one.
int tn = nWarps1*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 1: send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
|
||||
/*redOpArg=*/0, 0*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
// This block's window for the current iteration.
ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
// Segment belonging to the local node.
ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
// Send only the overlap of the window with the local node's segment; the length is
// clamped to 0 when they do not overlap, but send() is still called every iteration.
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.send(beg-railOneBeg, max(ssize_t(0), end-beg));
|
||||
}
|
||||
return;
|
||||
}
|
||||
tid -= tn;
|
||||
|
||||
tn = nWarps2*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 2: Recv network -> deposit output + send to bcast
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, direct->heads+1, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 1*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
// A fresh Scatterer per iteration carries the current offset into process().
Scatterer</*BcastSendNotRecv=*/true> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
tid -= tn;
|
||||
|
||||
// nWarps3 is 0 when there is a single head, in which case no thread enters phase 3.
tn = nWarps3*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 3: Recv bcast -> deposit output
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*BcastSendNotRecv=*/false> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/0>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -21,18 +21,21 @@ namespace {
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
#endif
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
int ringIx = ring->index;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
|
||||
ssize_t chunkCount = args->chunkCount;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*nranks*chunkSize;
|
||||
const ssize_t loopCount = nranks * chunkCount;
|
||||
ssize_t offset;
|
||||
ssize_t gridOffset = args->workOffset;
|
||||
ssize_t channelCount = args->workCount;
|
||||
const ssize_t size = args->count;
|
||||
int nelem;
|
||||
int chunk;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = bid;
|
||||
int npKitCtxIdx = gridOffset / channelCount;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
@@ -74,34 +77,21 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks));
|
||||
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize);
|
||||
realChunkSize = int(realChunkSize);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t remCount = channelCount - elemOffset;
|
||||
ssize_t chunkOffset;
|
||||
|
||||
if (remCount < loopCount) chunkCount = args->lastChunkCount;
|
||||
|
||||
auto calcOffset = [&]__device__(int chunk)->ssize_t {
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE)
|
||||
return gridOffset + bid*nranks*realChunkSize + chunk*realChunkSize;
|
||||
else
|
||||
return gridOffset + (chunk*nChannels + bid)*realChunkSize;
|
||||
};
|
||||
auto modRanks = [&]__device__(int r)->int {
|
||||
return r - (r >= nranks ? nranks : 0);
|
||||
};
|
||||
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int chunk;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
chunk = modRanks(ringIx + nranks-1);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
chunk = modRanks(ringIx + nranks - 1);
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
@@ -130,10 +120,11 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
chunk = modRanks(ringIx + nranks-j);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
for (int j = 2; j < nranks; ++j) {
|
||||
chunk = modRanks(ringIx + nranks - j);
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
|
||||
@@ -147,8 +138,9 @@ namespace {
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
chunk = ringIx + 0;
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
@@ -176,10 +168,11 @@ namespace {
|
||||
#endif
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
chunk = modRanks(ringIx + nranks-j);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
for (int j = 1; j < nranks - 1; ++j) {
|
||||
chunk = modRanks(ringIx + nranks - j);
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
}
|
||||
|
||||
@@ -200,8 +193,9 @@ namespace {
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = modRanks(ringIx + 1);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
prims.directRecv(offset, nelem);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
|
||||
@@ -229,21 +223,17 @@ namespace {
|
||||
__device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
|
||||
#endif
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclTree *tree = &ncclShmem.channel.tree;
|
||||
ssize_t chunkSize = int(
|
||||
Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
|
||||
/* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T));
|
||||
const ssize_t minChunkSize = int(
|
||||
Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T))
|
||||
/* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const ssize_t size = args->count;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = bid;
|
||||
int npKitCtxIdx = gridOffset / channelCount;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
@@ -268,9 +258,6 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
if (loopSize > size)
|
||||
chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
|
||||
|
||||
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
|
||||
(tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
@@ -290,23 +277,23 @@ namespace {
|
||||
#endif
|
||||
|
||||
if (tree->up == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
else if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -339,23 +326,23 @@ namespace {
|
||||
#endif
|
||||
|
||||
if (tree->up == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directSendFromOutput(offset, nelem);
|
||||
}
|
||||
}
|
||||
else if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -385,19 +372,16 @@ namespace {
|
||||
__device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
|
||||
#endif
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclTree *tree = &ncclShmem.channel.tree;
|
||||
ssize_t chunkSize = int(
|
||||
Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
|
||||
: Proto::calcBytePerStep()/sizeof(T));
|
||||
const ssize_t minChunkSize = int(
|
||||
Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T)) :
|
||||
Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T))
|
||||
/* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8);
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
const size_t channelCount = args->workCount;
|
||||
const ssize_t size = args->count;
|
||||
const int bid = gridOffset / channelCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
int nthreadsSplit;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
nthreadsSplit = nthreads/2;
|
||||
@@ -442,9 +426,6 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
if (loopSize > size)
|
||||
chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize);
|
||||
|
||||
if (tree->up == -1) {
|
||||
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
@@ -464,9 +445,9 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
|
||||
}
|
||||
|
||||
@@ -505,16 +486,16 @@ namespace {
|
||||
#endif
|
||||
|
||||
if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -548,16 +529,16 @@ namespace {
|
||||
#endif
|
||||
|
||||
if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -604,7 +585,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t chunkSize = args->chunkCount;
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;
|
||||
|
||||
@@ -700,14 +681,10 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
ssize_t chunkSize = args->chunkCount;
|
||||
const bool hasOut = nvls->out != -1;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
|
||||
const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
|
||||
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
|
||||
@@ -723,62 +700,114 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
|
||||
const int tidEndReduce = tidEndGather + nThreadsReduce;
|
||||
const int tidEndBcast = tidEndReduce + nThreadsBcast;
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
if (!hasOut) {
|
||||
if (args->oneNode) {
|
||||
const ssize_t loopCount = nvls->nHeads * chunkSize;
|
||||
const ssize_t channelCount = args->workCount;
|
||||
const ssize_t gridOffset = args->workOffset;
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce, broadcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
// Reduce, send to network
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkSize;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkSize, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (tid < tidEndBcast && nvls->headRank != -1) {
|
||||
// Recv from network, broadcast
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
} else {
|
||||
const int bid = args->bid;
|
||||
const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
if (!hasOut) {
|
||||
// Reduce, broadcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
// Reduce, send to network
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} else if (tid < tidEndBcast && nvls->headRank != -1) {
|
||||
// Recv from network, broadcast
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -788,14 +817,13 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const int treeUp = nvls->treeUp;
|
||||
const int* treeDown = nvls->treeDown;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
|
||||
ssize_t chunkCount = args->chunkCount;
|
||||
const ssize_t loopCount = nvls->nHeads * chunkCount;
|
||||
const ssize_t channelCount = args->workCount;
|
||||
const ssize_t gridOffset = args->workOffset;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const bool hasUp = treeUp != -1;
|
||||
const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
|
||||
@@ -803,6 +831,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
|
||||
const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
|
||||
const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
|
||||
const int nThreadsScatter = scatterWarps*WARP_SIZE;
|
||||
const int nThreadsGather = gatherWarps*WARP_SIZE;
|
||||
@@ -819,10 +849,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndGather) {
|
||||
// Gather
|
||||
@@ -830,10 +861,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
|
||||
int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
|
||||
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
if (!hasUp) {
|
||||
@@ -842,9 +874,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
@@ -853,9 +888,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -865,9 +903,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t chunkOffset;
|
||||
if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
|
||||
chunkOffset = elemOffset + nvls->headRank * chunkCount;
|
||||
offset = gridOffset + chunkOffset;
|
||||
nelem = min(chunkCount, channelCount - chunkOffset);
|
||||
prims.directRecvDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -882,7 +923,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclTree *tree = &ncclShmem.channel.collnetChain;
|
||||
ssize_t chunkSize = int(args->lastChunkSize);
|
||||
ssize_t chunkSize = args->chunkCount;
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t size = args->count;
|
||||
@@ -992,4 +1033,4 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL
|
||||
runTreeSplit<T, RedOp, ProtoLL128>(args);
|
||||
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -30,7 +30,7 @@ namespace {
|
||||
const ssize_t chunk_offset = elem_size * (num_elems / num_chunks * chunk_id + (chunk_id < num_padding_chunks ? chunk_id : num_padding_chunks));
|
||||
const ssize_t chunk_size = elem_size * (num_elems / num_chunks + (chunk_id < num_padding_chunks ? 1 : 0));
|
||||
const int pivot_direction = (bid % num_uni_rings) / num_bi_rings;
|
||||
const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLTOALL_PIVOT_CHUNKSTEPS : 1));
|
||||
const ssize_t prims_size = args->chunkCount;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0);
|
||||
@@ -39,10 +39,10 @@ namespace {
|
||||
const int src_rank = ring->userRanks[(nranks - num_hops) % nranks];
|
||||
const int dst_rank = ring->userRanks[num_hops];
|
||||
const ssize_t send_offset =
|
||||
dst_rank * num_elems * elem_size + chunk_offset +
|
||||
dst_rank * args->count + chunk_offset +
|
||||
(src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
|
||||
const ssize_t recv_offset =
|
||||
src_rank * num_elems * elem_size + chunk_offset +
|
||||
src_rank * args->count + chunk_offset +
|
||||
(src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
|
||||
const ssize_t send_recv_size =
|
||||
src_rank == dst_rank ?
|
||||
|
||||
@@ -16,20 +16,19 @@ namespace {
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
#endif
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ring->userRanks[0];
|
||||
const int nextRank = ring->userRanks[1];
|
||||
const int root = args->root;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = bid;
|
||||
int npKitCtxIdx = gridOffset / channelCount;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
@@ -58,20 +57,9 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
|
||||
realChunkSize = int(realChunkSize);
|
||||
|
||||
ssize_t offset = gridOffset + int(bid*realChunkSize);
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
|
||||
if (rank == root) {
|
||||
if (inputBuf == outputBuf) {
|
||||
@@ -108,4 +96,4 @@ struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -114,6 +114,7 @@ struct ncclShmemGroup {
|
||||
union {
|
||||
unpackGroupShmem unpack;
|
||||
} devicePlugin;
|
||||
int32_t dstSizes[NCCL_MAX_NVLS_ARITY+1];
|
||||
};
|
||||
|
||||
#define LDS_NUM_EVENTS 64
|
||||
|
||||
@@ -30,11 +30,11 @@ inline __device__ int loadInt(int* ptr) {
|
||||
template<typename RedFn, typename T, int Unroll, int BytePerPack,
|
||||
int MultimemSrcs, int MinSrcs, int MaxSrcs,
|
||||
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
|
||||
typename IntBytes>
|
||||
typename IntBytes, typename SrcPtrFn, typename DstPtrFn>
|
||||
__device__ __forceinline__ void reduceCopyPacks(
|
||||
int nThreads, int &thread,
|
||||
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
|
||||
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
|
||||
int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn,
|
||||
IntBytes &nBytesBehind, IntBytes &nBytesAhead
|
||||
) {
|
||||
static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
|
||||
@@ -68,10 +68,10 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
uintptr_t minDsts[MinDsts + !MinDsts];
|
||||
#pragma unroll
|
||||
for (int s=0; s < MinSrcs; s++)
|
||||
minSrcs[s] = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
|
||||
minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
|
||||
#pragma unroll
|
||||
for (int d=0; d < MinDsts; d++)
|
||||
minDsts[d] = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
|
||||
minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
|
||||
|
||||
// We dictate loop termination condition according to whether partial hunks
|
||||
// can be handled or not.
|
||||
@@ -116,7 +116,7 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
}
|
||||
|
||||
for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
|
||||
uintptr_t src = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
|
||||
uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
|
||||
BytePack<BytePerPack> tmp[Unroll];
|
||||
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
|
||||
#pragma unroll Unroll
|
||||
@@ -151,7 +151,7 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
}
|
||||
}
|
||||
for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) {
|
||||
uintptr_t dst = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
|
||||
uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
|
||||
#pragma unroll Unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
st_global<BytePerPack>(dst, acc[u]);
|
||||
@@ -185,11 +185,11 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
template<int Unroll, typename RedFn, typename T,
|
||||
int MultimemSrcs, int MinSrcs, int MaxSrcs,
|
||||
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
|
||||
typename IntBytes>
|
||||
typename IntBytes, typename SrcPtrFn, typename DstPtrFn>
|
||||
__device__ __forceinline__ void reduceCopy(
|
||||
int thread, int nThreads,
|
||||
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
|
||||
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
|
||||
int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn,
|
||||
IntBytes nElts
|
||||
) {
|
||||
static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values.");
|
||||
@@ -200,6 +200,9 @@ __device__ __forceinline__ void reduceCopy(
|
||||
// is supported for this redfn/type.
|
||||
constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize<RedFn>::BigPackSize;
|
||||
|
||||
if (MaxDsts==0) return;
|
||||
if (MinDsts==0 && nDsts==0) return;
|
||||
|
||||
IntBytes nBytesBehind = 0;
|
||||
IntBytes nBytesAhead = nElts*sizeof(T);
|
||||
|
||||
@@ -210,27 +213,27 @@ __device__ __forceinline__ void reduceCopy(
|
||||
#endif
|
||||
// Check that all pointers are BigPackSize aligned.
|
||||
bool aligned = true;
|
||||
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize);
|
||||
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize);
|
||||
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrFn(lane)) % (BigPackSize + !BigPackSize);
|
||||
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrFn(lane)) % (BigPackSize + !BigPackSize);
|
||||
aligned = !(__any(!aligned));
|
||||
if (aligned) {
|
||||
#if defined(__gfx90a__)
|
||||
reduceCopyPacks<RedFn, T, ((MinSrcs > 1) ? 2 : Unroll), BigPackSize,
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, nBytesBehind, nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead);
|
||||
#else
|
||||
reduceCopyPacks<RedFn, T, Unroll*((MinSrcs == 1 && MinDsts == 1) ? 2 : 1), BigPackSize,
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
#endif
|
||||
if (nBytesAhead == 0) return;
|
||||
|
||||
reduceCopyPacks<RedFn, T, /*Unroll=*/1, BigPackSize,
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
if (nBytesAhead == 0) return;
|
||||
}
|
||||
}
|
||||
@@ -240,25 +243,43 @@ __device__ __forceinline__ void reduceCopy(
|
||||
reduceCopyPacks<RedFn, T, Unroll/2*(16/sizeof(T))/2, sizeof(T),
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, nBytesBehind, nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead);
|
||||
} else {
|
||||
reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
}
|
||||
#else
|
||||
reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
#endif
|
||||
if (nBytesAhead == 0) return;
|
||||
|
||||
reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/sizeof(T),
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
|
||||
}
|
||||
|
||||
#endif // COMMON_KERNEL_H_
|
||||
template<int Unroll, typename RedFn, typename T,
|
||||
int MultimemSrcs, int MinSrcs, int MaxSrcs,
|
||||
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
|
||||
typename IntBytes>
|
||||
__device__ __forceinline__ void reduceCopy(
|
||||
int thread, int nThreads,
|
||||
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs,
|
||||
IntBytes nElts
|
||||
) {
|
||||
reduceCopy<Unroll, RedFn, T,
|
||||
MultimemSrcs, MinSrcs, MaxSrcs,
|
||||
MultimemDsts, MinDsts, MaxDsts, PreOpSrcs, IntBytes>
|
||||
(thread, nThreads, redArg, preOpArgs, postOp,
|
||||
nSrcs, [=]__device__(int i) { return srcPtrs[i]; },
|
||||
nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts);
|
||||
}
|
||||
|
||||
#endif // COMMON_KERNEL_H_
|
||||
Исполняемый файл
+405
@@ -0,0 +1,405 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Order of redops, tys, protos, algos must match src/include/device.h
|
||||
all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
|
||||
all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
|
||||
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
|
||||
all_protos = ["LL","LL128","SIMPLE"]
|
||||
all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
|
||||
|
||||
################################################################################
|
||||
# The first command line argument is the path to the directory to generate and
|
||||
# populate.
|
||||
|
||||
gensrc = sys.argv[1]
|
||||
|
||||
if os.path.exists(gensrc):
|
||||
for name in os.listdir(gensrc):
|
||||
os.remove(os.path.join(gensrc, name))
|
||||
#os.truncate(os.path.join(gensrc, name), 0)
|
||||
else:
|
||||
os.mkdir(gensrc)
|
||||
|
||||
################################################################################
|
||||
# The second command line argument is used as a regex to filter the functions
|
||||
# which make it into libnccl. This is helpful for reducing the binary when
|
||||
# developing device code. The regex supports non-space containing globs '*',
|
||||
# parentheses '(x)', and union 'a|b'. The string representing the function has
|
||||
# one of the forms:
|
||||
#
|
||||
# SendRecv
|
||||
# (AllGather|Broadcast) <algo> <proto>
|
||||
# (AllReduce|Reduce|ReduceScatter) <redop> <type> <algo> <proto>
|
||||
#
|
||||
# The possible values for redop, type, algo, proto can be found in the all_<foo>
|
||||
# lists at the top of this file.
|
||||
#
|
||||
# Since the Makefile forwards this from the ONLY_FUNCS variable, useful command
|
||||
# line examples are given:
|
||||
"""
|
||||
# Only send/recv:
|
||||
make ONLY_FUNCS="SendRecv"
|
||||
|
||||
# Only non-reductions:
|
||||
make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv"
|
||||
|
||||
# Only AllReduce sum f32 (but all algos, protos)
|
||||
make ONLY_FUNCS="AllReduce Sum f32 * *"
|
||||
|
||||
# Only AllReduce minmax i32 NVLS (but all protos)
|
||||
make ONLY_FUNCS="AllReduce MinMax i32 NVLS *"
|
||||
|
||||
# AllReduce sum <all floats> RING LL128
|
||||
make ONLY_FUNCS="AllReduce Sum f32 RING LL128"
|
||||
"""
|
||||
|
||||
# Paste all non-None arguments together with `sep`.
|
||||
def paste(sep, *args):
    """Join all arguments that are not None, separated by `sep`.

    Arguments equal to None are skipped entirely (they contribute neither
    text nor a separator). Returns the empty string when nothing remains.
    """
    return sep.join(x for x in args if x is not None)
|
||||
|
||||
def make_func_filter(pattern):
    """Build the predicate deciding whether a function tuple is kept.

    `pattern` is the user-supplied filter text. '*' is a glob for any run of
    non-space characters; parentheses and '|' keep their regex meaning. The
    function tuple is rendered as its non-None fields joined by single spaces
    and must match the pattern in full, case-insensitively.

    An empty (or missing) pattern yields a predicate that accepts everything.
    """
    if not pattern:
        def accept_all(*fn):
            return True
        return accept_all

    import re
    regex = pattern.replace("*", "[^ ]*")

    def matches(*fn):
        text = " ".join(x for x in fn if x is not None)
        # fullmatch anchors the whole alternation. Appending "$" to the
        # pattern only anchored the last alternative of a top-level 'a|b'
        # union, letting earlier alternatives match as mere prefixes.
        return re.fullmatch(regex, text, flags=re.IGNORECASE) is not None
    return matches


# Optional second command line argument: the function filter pattern.
func_pattern = sys.argv[2:3]
func_filter = make_func_filter(func_pattern[0] if func_pattern else "")
|
||||
|
||||
################################################################################
|
||||
|
||||
algos_of_coll = {
|
||||
"AllGather": ["RING","COLLNET_DIRECT","NVLS"],
|
||||
"AllReduce": all_algos,
|
||||
"Broadcast": ["RING"],
|
||||
"Reduce": ["RING"],
|
||||
"ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"],
|
||||
"SendRecv": [None]
|
||||
}
|
||||
|
||||
coll_camel_to_lower = {
|
||||
"AllGather": "all_gather",
|
||||
"AllReduce": "all_reduce",
|
||||
"Broadcast": "broadcast",
|
||||
"Reduce": "reduce",
|
||||
"ReduceScatter": "reduce_scatter",
|
||||
"SendRecv": "sendrecv"
|
||||
}
|
||||
coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower}
|
||||
|
||||
################################################################################
|
||||
|
||||
# Returns pair of minimum required values for (CUDART_VERSION, __CUDA_ARCH__)
|
||||
# or None if function is never supported. Note that (0, 0) encodes universal
|
||||
# support.
|
||||
def required_cuda(coll, redop, ty, algo, proto):
    """Return the minimum (CUDART_VERSION, __CUDA_ARCH__) needed by a function.

    Returns None when the combination is never supported, and (0, 0) when it
    is supported everywhere.
    """
    cudart, arch = 0, 0
    # kernels mapped to by coll="Nop" functions have coll="Generic"
    if coll in ("SendRecv", "Generic", "Nop"):
        return (cudart, arch)

    # Only RING and TREE are available with a protocol other than SIMPLE.
    if proto != "SIMPLE" and algo not in ("RING", "TREE"):
        return None

    if coll in ("AllReduce", "Reduce", "ReduceScatter"):
        # SumPostDiv is only accepted for integer types ("i*" / "u*").
        if redop == "SumPostDiv" and ty[0] not in ("i", "u"):
            return None
        if ty == "bf16":
            cudart = max(cudart, 11000)

    if "NVLS" in algo:
        if coll in ("AllReduce", "Reduce", "ReduceScatter"):
            # Must match ncclNvlsSupported() in src/include/device.h
            nvls_ok = ((ty in ("i32", "u32", "i64", "u64") and redop in ("Sum", "MinMax")) or
                       (ty in ("f32", "f64") and redop == "Sum") or
                       (ty in ("f16", "bf16") and redop in ("Sum", "MinMax")))
            if not nvls_ok:
                return None
        # These minimums apply to every NVLS function, reducing or not.
        cudart = max(cudart, 12010)
        arch = max(arch, 900)

    return (cudart, arch)
|
||||
|
||||
# Maps functions to the chosen representative for the equivalence class it
|
||||
# belongs to. For instance (sum, signed int) maps to (sum, unsigned int).
|
||||
def equivalent_primary(coll, redop, ty, algo, proto):
    """Map a function to the representative of its equivalence class.

    For the reducing collectives, a signed integer type ("i...") is replaced
    by the unsigned type of the same width ("u...") when the reduction is
    Sum, Prod or PreMulSum, and also for MinMax unless the algorithm name
    contains "NVLS". Every other function is returned unchanged.
    """
    if coll in ("AllReduce", "Reduce", "ReduceScatter"):
        # map signed integer sum/prod to unsigned
        if redop in ("Sum", "Prod", "PreMulSum") and ty[0] == "i":
            return (coll, redop, "u" + ty[1:], algo, proto)
        # map signed integer min/max to unsigned for non-NVLS
        if redop == "MinMax" and ty[0] == "i" and ("NVLS" not in algo):
            return (coll, redop, "u" + ty[1:], algo, proto)
    return (coll, redop, ty, algo, proto)
|
||||
|
||||
# Map to another func representing the best kernel to use. Every distinct value
|
||||
# returned will instantiate a ncclDevKernel specialized to run this func
|
||||
# without function call overhead.
|
||||
def best_kernel(coll, redop, ty, algo, proto):
    """Pick the kernel function used to launch the given function.

    The choice made by the inner `best` is normalised through
    equivalent_primary(); if the resulting kernel is rejected by
    func_filter() the "Generic" kernel tuple is returned instead.
    """
    def best(coll, redop, ty, algo, proto):
        # Modify this logic to control how many kernels are specialized.
        if coll == "Nop":
            return ("Generic", None, None, None, None)
        if coll == "SendRecv":
            return ("SendRecv", None, None, None, None)
        if coll in ("AllGather", "Broadcast"):
            return (coll, None, None, "RING", "LL")
        return (coll, "Sum", ty, ("TREE" if algo == "TREE" else "RING"), "LL")

    # Need to ensure the kernel is specialized for a primary function
    kfn = equivalent_primary(*best(coll, redop, ty, algo, proto))
    # And isn't filtered out.
    if not func_filter(*kfn):
        return ("Generic", None, None, None, None)
    return kfn
|
||||
|
||||
# The order in which rows are enumerated must match the formula of `ncclDevFuncId()`:
|
||||
def enumerate_func_rows():
    """Yield every (coll, redop, ty, algo, proto) row in table order.

    Order: the single SendRecv row; then AllGather and Broadcast iterated by
    algo, proto; then AllReduce, Reduce and ReduceScatter iterated by redop,
    ty, algo, proto (outermost to innermost). Fields that do not apply to a
    collective are None.
    """
    yield ("SendRecv", None, None, None, None)
    for coll in ("AllGather", "Broadcast"):
        algos = algos_of_coll[coll]
        for algo in algos:
            for proto in all_protos:
                yield (coll, None, None, algo, proto)
    for coll in ("AllReduce", "Reduce", "ReduceScatter"):
        algos = algos_of_coll[coll]
        for redop in all_redops:
            for ty in all_tys:
                for algo in algos:
                    for proto in all_protos:
                        yield (coll, redop, ty, algo, proto)
|
||||
|
||||
################################################################################
|
||||
|
||||
def is_built(coll, redop, ty, algo, proto):
    """Return a truthy value when the function is supported and not filtered.

    The result is falsy (None or False) when required_cuda() rejects the
    function or func_filter() excludes it.
    """
    built = required_cuda(coll, redop, ty, algo, proto)
    built = built and func_filter(coll, redop, ty, algo, proto)
    return built
|
||||
|
||||
# Returns None if required_cuda(...) is None.
|
||||
# Returns the coll="Nop" function if developer has filtered it out.
|
||||
# Otherwise just returns func it was given.
|
||||
def validate(coll, redop, ty, algo, proto):
    """Classify a function row.

    Returns None when required_cuda() reports the function as unsupported,
    the ("Nop", None, None, None, None) tuple when it is supported but
    excluded by func_filter(), and the function tuple itself otherwise.
    """
    valid = required_cuda(coll, redop, ty, algo, proto)
    built = valid and func_filter(coll, redop, ty, algo, proto)
    if built:
        return (coll, redop, ty, algo, proto)
    if valid:
        return ("Nop", None, None, None, None)
    return None
|
||||
|
||||
# Corresponds to ncclDevFuncRowToId[]
|
||||
func_rows = [validate(*fn) for fn in enumerate_func_rows()]
|
||||
|
||||
# Corresponds to ncclDevFuncTable[]
|
||||
primary_funcs = sorted(set(equivalent_primary(*fn) for fn in func_rows if fn is not None))
|
||||
|
||||
# primary_to_index[primary_funcs[i]] == i
|
||||
primary_to_index = {fn: i for (i,fn) in zip(range(len(primary_funcs)), primary_funcs)}
|
||||
|
||||
kernel_funcs = sorted(set(best_kernel(*fn) for fn in primary_funcs))
|
||||
|
||||
################################################################################
|
||||
|
||||
# Generate <gensrc>/device_table.cu
|
||||
with open(os.path.join(gensrc, "device_table.cu"), "w") as f:
|
||||
out = f.write
|
||||
out('#include "common.h"\n')
|
||||
out("\n")
|
||||
|
||||
for fn in primary_funcs:
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
cudart, arch = required_cuda(*fn)
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
|
||||
out("__device__ void %s();\n" % sym)
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#endif\n")
|
||||
out("\n")
|
||||
|
||||
out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n");
|
||||
index = 0
|
||||
for fn in primary_funcs:
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
cudart, arch = required_cuda(*fn)
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch))
|
||||
out("/*%4d*/ %s,\n" % (index, sym))
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
|
||||
index += 1
|
||||
out("nullptr};\n")
|
||||
out("\n")
|
||||
|
||||
out("// Workaround for https://reviews.llvm.org/D55580\n"
|
||||
"__device__ void ncclWorkaroundClangD55580() {}\n")
|
||||
|
||||
# Generate <gensrc>/host_table.cc
|
||||
with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
|
||||
out = f.write
|
||||
out('#include "device.h"\n')
|
||||
out("\n")
|
||||
|
||||
# The mapping from function rows to valid primary function ids.
|
||||
out("extern int const ncclDevFuncRowToId[] = {\n")
|
||||
index = 0
|
||||
for fn in func_rows:
|
||||
fn_id, comment = -1, ""
|
||||
if fn is not None:
|
||||
fn_id = primary_to_index[equivalent_primary(*fn)]
|
||||
comment = " // " + paste(" ", *fn)
|
||||
out("/*%4d*/ %d,%s\n" % (index, fn_id, comment))
|
||||
index += 1
|
||||
out("-1};\n")
|
||||
out("\n")
|
||||
|
||||
# Forward declarations of kernels.
|
||||
for kfn in kernel_funcs:
|
||||
cudart, _ = required_cuda(*kfn)
|
||||
sym = paste("_", "ncclDevKernel", *kfn)
|
||||
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
|
||||
out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
|
||||
if cudart != 0: out("#endif\n")
|
||||
out("\n")
|
||||
|
||||
# List of all kernel function pointers.
|
||||
out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs))
|
||||
out("extern void* const ncclDevKernelList[] = {\n")
|
||||
index = 0
|
||||
for kfn in kernel_funcs:
|
||||
cudart, _ = required_cuda(*kfn)
|
||||
sym = paste("_", "ncclDevKernel", *kfn)
|
||||
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
|
||||
out("/*%4d*/ (void*)%s,\n" % (index, sym));
|
||||
if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
|
||||
index += 1
|
||||
out("nullptr};\n")
|
||||
out("\n")
|
||||
|
||||
# Maps primary id to kernel function pointer.
|
||||
out("extern void* const ncclDevKernelForFunc[] = {\n")
|
||||
index = 0
|
||||
for fn in primary_funcs:
|
||||
kfn = best_kernel(*fn)
|
||||
sym = paste("_", "ncclDevKernel", *kfn)
|
||||
cudart, _ = required_cuda(*kfn)
|
||||
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
|
||||
out("/*%4d*/ (void*)%s,\n" % (index, sym))
|
||||
if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
|
||||
index += 1
|
||||
out("nullptr};\n")
|
||||
out("\n")
|
||||
|
||||
# Does the prior map use an explicitly specialized kernel.
|
||||
out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n")
|
||||
index = 0
|
||||
for fn in primary_funcs:
|
||||
kfn = best_kernel(*fn)
|
||||
specialized = "1" if fn == kfn else "0"
|
||||
out("/*%4d*/ %s,\n" % (index, specialized))
|
||||
index += 1
|
||||
out("0};\n")
|
||||
|
||||
# Maps to .cu filename which implements this func. The only constraint is that
|
||||
# "coll" is reflected in the name: formally that no two funcs having different
|
||||
# coll's map to the same filename.
|
||||
def impl_filename(coll, redop, ty, algo, proto):
    """Return the .cu filename implementing the given function.

    The name is built from the lower-case collective name, the lower-cased
    redop and the type, joined by '_' with None parts skipped; `algo` and
    `proto` do not influence it.
    """
    return "%s.cu" % paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty)
|
||||
|
||||
# Partition the functions and kernels to the .cu filenames. The partition is
|
||||
# a dictionary mapping filename to (coll, func-tuple list)
|
||||
def partition_by_name(fns):
    """Group function tuples by the filename returned by impl_filename().

    Returns a dict mapping filename -> (coll, [function tuples]), where
    `coll` is taken from the first function seen for that filename and the
    list preserves the iteration order of `fns`.
    """
    ans = {}
    for fn in fns:
        name = impl_filename(*fn)
        coll = fn[0]
        if name not in ans:
            ans[name] = (coll, [])
        ans[name][1].append(fn)
    return ans
|
||||
|
||||
name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop")
|
||||
name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic")
|
||||
|
||||
# Generate <gensrc>/rules.mk
|
||||
with open(os.path.join(gensrc, "rules.mk"), "w") as f:
|
||||
out = f.write
|
||||
impl_names = sorted(name_to_funcs.keys())
|
||||
names = impl_names + ["host_table.cc", "device_table.cu"]
|
||||
out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n"
|
||||
.format(names=" ".join(names)))
|
||||
out("\n")
|
||||
|
||||
# For each <coll>_<op>_<ty>.cu compile to a .cu.o file. Notice the dependencies
|
||||
# come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu')
|
||||
for name in impl_names:
|
||||
coll = name_to_funcs[name][0]
|
||||
out(
|
||||
"$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n"
|
||||
"\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n"
|
||||
"\n"
|
||||
.format(name=name, lower_coll=coll_camel_to_lower[coll])
|
||||
)
|
||||
|
||||
# Add the suffix-erased .cu's which are used only for dependency scraping.
|
||||
for coll in set(coll for (coll,_,_,_,_) in primary_funcs if coll!="Nop"):
|
||||
name = impl_filename(coll, None, None, None, None)
|
||||
if name not in name_to_funcs:
|
||||
name_to_funcs[name] = (coll, [])
|
||||
|
||||
redop_to_cxx = {
|
||||
None: "FuncCopy",
|
||||
"Sum": "FuncSum",
|
||||
"Prod": "FuncProd",
|
||||
"MinMax": "FuncMinMax",
|
||||
"PreMulSum": "FuncPreMulSum",
|
||||
"SumPostDiv": "FuncSumPostDiv"
|
||||
}
|
||||
|
||||
ty_to_cxx = {
|
||||
None: "int8_t",
|
||||
"i8": "int8_t",
|
||||
"u8": "uint8_t",
|
||||
"i32": "int32_t",
|
||||
"u32": "uint32_t",
|
||||
"i64": "int64_t",
|
||||
"u64": "uint64_t",
|
||||
"f16": "half",
|
||||
"f32": "float",
|
||||
"f64": "double",
|
||||
"bf16": "__nv_bfloat16"
|
||||
}
|
||||
|
||||
# Generate each <gensrc>/<impl>.cu:
|
||||
for name in name_to_funcs.keys():
|
||||
(coll, fns) = name_to_funcs[name]
|
||||
with open(os.path.join(gensrc, name), "w") as f:
|
||||
out = f.write
|
||||
out(
|
||||
'#include "common.h"\n'
|
||||
'#include "{lower_coll}.h"\n'
|
||||
.format(lower_coll=coll_camel_to_lower[coll])
|
||||
)
|
||||
|
||||
(_, kfns) = name_to_kernels.get(name) or (None, [])
|
||||
for kfn in kfns:
|
||||
(coll, redop, ty, algo, proto) = kfn
|
||||
sym = paste("_", coll, redop, ty, algo, proto)
|
||||
fn_id = primary_to_index[kfn]
|
||||
cudart, arch = required_cuda(*kfn)
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
|
||||
out(
|
||||
"DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
|
||||
.format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
|
||||
algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id)
|
||||
)
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#endif\n")
|
||||
|
||||
for fn in fns:
|
||||
(coll, redop, ty, algo, proto) = fn
|
||||
sym = paste("_", coll, redop, ty, algo, proto)
|
||||
cudart, arch = required_cuda(*fn)
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
|
||||
out(
|
||||
"DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n"
|
||||
.format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
|
||||
algo=(algo or "RING"), proto=(proto or "SIMPLE"))
|
||||
)
|
||||
if (cudart, arch) != (0, 0):
|
||||
out("#endif\n")
|
||||
@@ -44,7 +44,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
uint64_t recvConnHead;
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile struct ncclConnFifo* sendConnFifo = NULL;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
uint64_t sendConnHead;
|
||||
uint64_t sendConnHeadCache; // Cache last seen value
|
||||
@@ -114,10 +114,9 @@ private:
|
||||
sendConnHeadCache = atomicAdd((unsigned long long *)sendConnHeadPtr, 0);
|
||||
if (checkAbort(spins, 1)) break;
|
||||
}
|
||||
__asm__ __volatile__("s_wakeup");
|
||||
if (sendConnFifoPtr) {
|
||||
if (sendConnFifo) {
|
||||
int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
|
||||
__atomic_store_n(sendConnFifoPtr+sendConnHead%NCCL_STEPS, (size), __ATOMIC_RELAXED);
|
||||
sendConnFifo[sendConnHead%NCCL_STEPS].size = size;
|
||||
}
|
||||
sendConnHead += 1;
|
||||
}
|
||||
@@ -586,7 +585,7 @@ private:
|
||||
sendConnHeadPtr = sendConn->head;
|
||||
sendConnHeadCache = *sendConnHeadPtr;
|
||||
sendConnHead = sendConn->step;
|
||||
sendConnFifoPtr = sendConn->sizesFifo;
|
||||
sendConnFifo = sendConn->connFifo;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -594,7 +593,7 @@ private:
|
||||
__device__ Primitives(
|
||||
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
|
||||
):
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
|
||||
@@ -772,4 +771,4 @@ private:
|
||||
__device__ void localCopy(T* srcs, T* dsts, int eltN) {
|
||||
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -42,7 +42,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
uint64_t recvConnHead;
|
||||
|
||||
struct ncclConnInfo* sendConn = NULL;
|
||||
volatile int* sendConnFifoPtr = NULL;
|
||||
volatile struct ncclConnFifo* sendConnFifo = NULL;
|
||||
volatile uint64_t* sendConnTailPtr = NULL;
|
||||
uint64_t sendConnTail;
|
||||
volatile uint64_t* sendConnHeadPtr = NULL;
|
||||
@@ -102,9 +102,8 @@ private:
|
||||
sendConnHeadCache = __atomic_load_n(sendConnHeadPtr, __ATOMIC_RELAXED);
|
||||
if (checkAbort(spins, wid, 1)) break;
|
||||
}
|
||||
__asm__ __volatile__("s_wakeup");
|
||||
if (sendConnFifoPtr) {
|
||||
__atomic_store_n(sendConnFifoPtr+sendStep[wid]%NCCL_STEPS, nbytes, __ATOMIC_RELAXED);
|
||||
if (sendConnFifo) {
|
||||
sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes;
|
||||
}
|
||||
sendConnHead += 1;
|
||||
}
|
||||
@@ -487,10 +486,10 @@ private:
|
||||
sendConnHeadPtr = sendConn->head;
|
||||
sendConnHeadCache = *sendConnHeadPtr;
|
||||
sendConnHead = sendConn->step;
|
||||
sendConnFifoPtr = sendConn->sizesFifo;
|
||||
sendConnFifo = sendConn->connFifo;
|
||||
}
|
||||
if (tid >= nthreads-WARP_SIZE && wid<fan.nsend()) {
|
||||
if (sendConn->sizesFifo) {
|
||||
if (sendConn->connFifo) {
|
||||
sendConnTailPtr = sendConn->tail;
|
||||
sendConnTail = sendConn->step;
|
||||
}
|
||||
@@ -581,4 +580,4 @@ public:
|
||||
__device__ void localCopy(T* srcs, T* dsts, int eltN) {
|
||||
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -27,8 +27,8 @@ class Primitives<
|
||||
RolePostSend = 0x10,
|
||||
RolePostRecv = 0x20,
|
||||
Aborted = 0x40,
|
||||
OffsFifoEnabled = 0x80,
|
||||
SizesFifoEnabled = 0x100,
|
||||
UserBufferMode = 0x80,
|
||||
ConnFifoEnabled = 0x100,
|
||||
DirectWrite = 0x200,
|
||||
DirectRead = 0x400,
|
||||
ThreadsSynced = 0x800,
|
||||
@@ -46,15 +46,12 @@ class Primitives<
|
||||
int flags;
|
||||
int group;
|
||||
uint64_t step;
|
||||
int *connOffsFifoPtr; // (flags & OffsFifoEnabled)
|
||||
struct ncclConnFifo* connFifo = NULL;
|
||||
union {
|
||||
T *userBuff; // (flags & (RoleInput|RoleOutput))
|
||||
T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
|
||||
};
|
||||
union {
|
||||
int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
|
||||
T *directBuff; // !(flags & SizesFifoEnabled)
|
||||
};
|
||||
T *directBuff;
|
||||
uint64_t *connStepPtr;
|
||||
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
|
||||
uint64_t* barriers;
|
||||
@@ -129,14 +126,16 @@ private:
|
||||
}
|
||||
|
||||
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
|
||||
if (isSendNotRecv && (flags & SizesFifoEnabled))
|
||||
__atomic_store_n(connSizesFifoPtr+step%NCCL_STEPS, nelts*sizeof(T), __ATOMIC_RELAXED);
|
||||
if (flags & ConnFifoEnabled)
|
||||
connFifo[step%NCCL_STEPS].size = nelts*sizeof(T);
|
||||
|
||||
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
|
||||
: (ncclShmem.groups[group].srcs + Src);
|
||||
if (flags & OffsFifoEnabled)
|
||||
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
|
||||
else if (isSendNotRecv && DirectSend) {
|
||||
if (flags & UserBufferMode) {
|
||||
// Do nothing
|
||||
} else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
|
||||
ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
|
||||
} else if (isSendNotRecv && DirectSend) {
|
||||
if (flags & (DirectWrite | NvlsDirectWrite)) {
|
||||
ptrs[index] = directBuff + dstIx + offset;
|
||||
} else if (flags & DirectRead) { // empty send
|
||||
@@ -196,7 +195,7 @@ private:
|
||||
int slice = 0;
|
||||
int offset = 0;
|
||||
|
||||
if (tid < nworkers && offset < nelem) {
|
||||
if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) {
|
||||
// Worker-only loop for non-empty slices. Non-workers and empty slices are
|
||||
// processed in the loop following this if block. The benefit of splitting
|
||||
// the loop like this is we pull two branches out of the critical path.
|
||||
@@ -422,6 +421,55 @@ private:
|
||||
barrier();
|
||||
}
|
||||
|
||||
public:
|
||||
// Runs the caller-supplied functor `fn` once per slice (SlicePerChunk
// iterations) on the worker threads, handling the connection wait before the
// call and the step publication after it.
//
// Template parameters:
//   Recv, Send - 0/1 switches; they are multiplied into the role masks so that
//                only the receive-side and/or send-side roles take part.
//   Fn         - functor type; it is invoked through a templated operator()
//                with (tid, nworkers, slice, stepSize*StepPerSlice, nrecv,
//                srcs, nsend, dsts, dstSizes).
template<int Recv, int Send, typename Fn>
|
||||
__device__ __forceinline__ void process(Fn &&fn) {
|
||||
#pragma unroll 1
|
||||
for (int slice=0; slice < SlicePerChunk; slice++) {
|
||||
if (tid < nworkers) {
|
||||
// Only threads holding an enabled wait role poll the connection.
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
|
||||
// With both directions enabled the thread's own role flag decides the side.
bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
|
||||
int spins = 0;
|
||||
// Spin until the cached peer step (plus NCCL_STEPS of slack on the send
// side) covers step + StepPerSlice; checkAbort() can end the wait early.
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
// Publish this connection's buffer pointer into the group's dsts (send) or
// srcs (recv) table at slot `index`.
void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts
|
||||
: ncclShmem.groups[group].srcs;
|
||||
// If the FIFO entry is in NCCL_MODE_OFFSET, its `offset` field is divided by
// sizeof(T) before being added to connEltsFifo; otherwise the slot is
// derived from step%NCCL_STEPS and stepSize.
if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
|
||||
int offset = loadInt(&connFifo[step%NCCL_STEPS].offset);
|
||||
ptrs[index] = connEltsFifo + offset/sizeof(T);
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
}
|
||||
}
|
||||
// NOTE(review): presumably makes the pointers written above visible to all
// workers before fn reads them - confirm against subBarrier()'s definition.
subBarrier();
|
||||
fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend>
|
||||
(tid, nworkers, slice, stepSize*StepPerSlice,
|
||||
fan.nrecv(), ncclShmem.groups[group].srcs,
|
||||
fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes);
|
||||
}
|
||||
barrier();
|
||||
int32_t dstSize = 0;
|
||||
// The post-send thread consumes the size stored in dstSizes[index], clears
// the slot, and records the size multiplied by sizeof(T) in the connection
// FIFO when one is attached.
if (flags & Send*RolePostSend) {
|
||||
dstSize = ncclShmem.groups[group].dstSizes[index];
|
||||
ncclShmem.groups[group].dstSizes[index] = 0;
|
||||
if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T);
|
||||
}
|
||||
barrier();
|
||||
// Every participating wait/post role advances its local step counter.
if (flags & (Recv*(RoleWaitRecv|RolePostRecv) | Send*(RoleWaitSend|RolePostSend))) {
|
||||
step += StepPerSlice;
|
||||
}
|
||||
// Posting threads publish the new step through connStepPtr.
if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
|
||||
// Send side only, and only if data was produced or a FIFO entry was written.
// NOTE(review): the fence presumably orders those writes before the step
// store below - confirm against fence_acq_rel_sys()'s definition.
if (Send && (!Recv || (flags & RolePostSend)) && (dstSize!=0 || (flags&ConnFifoEnabled))) {
|
||||
fence_acq_rel_sys();
|
||||
}
|
||||
st_relaxed_sys_global(connStepPtr, step);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Scatter/Gather generic op
|
||||
// skip: my own rank order in the buffer chunks
|
||||
// shift: peer offset to avoid all ranks sending to or receiving from same peer
|
||||
@@ -507,8 +555,11 @@ private:
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->tail;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
|
||||
if (Direct) {
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
if (conn->connFifo != nullptr) {
|
||||
flags |= ConnFifoEnabled;
|
||||
connFifo = conn->connFifo;
|
||||
} else if (Direct) {
|
||||
// User buffers have been registered
|
||||
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
@@ -530,9 +581,6 @@ private:
|
||||
flags |= NvlsDirectRead;
|
||||
}
|
||||
}
|
||||
if (flags & OffsFifoEnabled)
|
||||
connOffsFifoPtr = conn->offsFifo;
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -542,6 +590,10 @@ private:
|
||||
auto *conn = &peer->send[connIndex];
|
||||
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
|
||||
connFifo = conn->connFifo;
|
||||
if (connFifo != nullptr) flags |= ConnFifoEnabled;
|
||||
|
||||
if (flags & RolePostSend) {
|
||||
connStepPtr = conn->tail;
|
||||
next_hdp_reg = conn->next_hdp_reg;
|
||||
@@ -552,15 +604,8 @@ private:
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->head;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
|
||||
if (flags & OffsFifoEnabled)
|
||||
connOffsFifoPtr = conn->offsFifo;
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
|
||||
if (conn->sizesFifo != nullptr) {
|
||||
flags |= SizesFifoEnabled;
|
||||
connSizesFifoPtr = conn->sizesFifo;
|
||||
} else if (Direct) {
|
||||
if (connFifo == nullptr && Direct) {
|
||||
// User buffers have been registered
|
||||
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
@@ -590,7 +635,7 @@ private:
|
||||
__forceinline__ __device__ Primitives(
|
||||
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, int stepSize_=0
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
|
||||
):
|
||||
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
|
||||
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
|
||||
@@ -631,6 +676,8 @@ private:
|
||||
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
|
||||
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
|
||||
|
||||
if (p2p && p2p->reg) flags |= UserBufferMode;
|
||||
|
||||
// if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
// flags |= AnyNetDeviceUnpack;
|
||||
// // g == 0 is the first ThreadPerSync # of threads of this warp
|
||||
@@ -657,10 +704,21 @@ private:
|
||||
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
|
||||
conns[index]->step = step;
|
||||
}
|
||||
|
||||
if ((flags & UserBufferMode) && (flags & RoleWaitSend)) {
|
||||
// Make sure we wait until the proxy has sent data before we return.
|
||||
// We don't want the next CUDA kernel to overwrite the send buffer which
|
||||
// was accessed directly.
|
||||
uint64_t prevStep = step - StepPerSlice;
|
||||
volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size);
|
||||
while (*ptr != -1);
|
||||
}
|
||||
|
||||
if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
|
||||
ncclNetDeviceSaveHead(netDeviceHandle, group);
|
||||
}
|
||||
|
||||
// Make sure all threads are done writing back conn->step and done using
|
||||
// ncclShmem.groups[group]
|
||||
barrier();
|
||||
}
|
||||
|
||||
@@ -860,4 +918,4 @@ private:
|
||||
__device__ __forceinline__ void localCopy(T* srcs, T* dsts, int eltN) {
|
||||
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -17,56 +17,39 @@ namespace {
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
#endif
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const int nthreads = (int)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int prevRank = ring->userRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t channelCount = args->workCount;
|
||||
const size_t gridOffset = args->workOffset;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
|
||||
|
||||
auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
|
||||
int realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
|
||||
return realChunkSize;
|
||||
};
|
||||
|
||||
if (prevRank == root) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = calcChunkSize(gridOffset);
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
else if (rank == root) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = calcChunkSize(gridOffset);
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = calcChunkSize(gridOffset);
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -93,4 +76,4 @@ struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -663,7 +663,7 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
BytePack<2*sizeof(T)> tmp; \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
} \
|
||||
};
|
||||
@@ -676,11 +676,11 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
if (fn.isMinNotMax) { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
} else { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
} \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
} \
|
||||
|
||||
@@ -17,56 +17,43 @@ namespace {
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
#endif
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
int const *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t offset;
|
||||
size_t dataOffset;
|
||||
size_t count = args->count;
|
||||
uint32_t nelem;
|
||||
int rankDest;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
|
||||
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
}
|
||||
else if (Proto::Id == NCCL_PROTO_LL)
|
||||
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
|
||||
else if (Proto::Id == NCCL_PROTO_LL128)
|
||||
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
|
||||
realChunkSize = int(realChunkSize);
|
||||
|
||||
ssize_t chunkOffset = gridOffset + bid*int(realChunkSize);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
|
||||
dataOffset = gridOffset + elemOffset;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ringRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.send(offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ringRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
rankDest = ringRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
prims.recvReduceCopy(offset, chunkOffset, nelem, /*postOp=*/true);
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.recvReduceCopy(offset, dataOffset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -97,14 +84,15 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const size_t chunkCount = args->chunkCount;
|
||||
const size_t count = args->count;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
size_t gridOffset = args->workOffset;
|
||||
size_t channelCount = args->workCount;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
|
||||
* if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
|
||||
@@ -121,10 +109,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.scatter(offset, nvls->nHeads * size, nelem, size, -1, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce through NVLS
|
||||
@@ -132,9 +120,9 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recv(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -145,7 +133,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
|
||||
args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
}
|
||||
|
||||
@@ -157,10 +145,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
|
||||
args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t outOffset = gridOffset + bid * chunkSize;
|
||||
ssize_t inpOffset = outOffset + rank * size;
|
||||
int nelem = min(chunkSize, size - outOffset);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
size_t outOffset = gridOffset + elemOffset;
|
||||
size_t inpOffset = outOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopy(inpOffset, outOffset, nelem);
|
||||
}
|
||||
|
||||
@@ -170,3 +158,146 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// ReduceScatter work element for the COLLNET_DIRECT algorithm with the SIMPLE
// protocol. run() splits the block's warps into three groups, one per phase
// (scatter to peers, reduce and send to the network, receive from the network).
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
|
||||
// Functor handed to Primitives::process() by phases 1 and 2 of run().
// ReduceSendNotRecv=true: starts at the rail after headRank and loops over
// destinations until dst reaches nRails-1.
// ReduceSendNotRecv=false: handles only destination 0, on rail headRank.
template<bool ReduceSendNotRecv>
|
||||
struct Scatterer {
|
||||
// Work element; supplies bid, sendbuff, count and redOpArg.
struct ncclWorkElem* args;
|
||||
// Chunk length; it is compared and added against offsets derived from args->count.
int chunkSize;
|
||||
// Start offset of the current grid-wide iteration; set by run() before each process() call.
ssize_t railGridOffset;
|
||||
|
||||
// Called with the thread id/count, the slice index, and the source and
// destination pointer tables; `slice` and `maxSliceSize` are not read here.
// dstSizes[tid] is written for tid < nDsts.
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
|
||||
) {
|
||||
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = direct->nHeads;
|
||||
int bid = args->bid;
|
||||
void* inbuf = (void*)args->sendbuff;
|
||||
ssize_t sizePerRank = args->count;
|
||||
|
||||
// [railAllBeg, railAllEnd) is this channel's chunk, clamped to nNodes*sizePerRank.
ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
// One thread per destination records the chunk length for that destination.
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
|
||||
int dst = 0;
|
||||
int rail;
|
||||
if (!ReduceSendNotRecv) {
|
||||
rail = direct->headRank;
|
||||
} else {
|
||||
// Start at the rail after our own, wrapping around at nRails.
rail = direct->headRank+1;
|
||||
if (rail == nRails) rail = 0;
|
||||
}
|
||||
do {
|
||||
int node = railAllBeg/sizePerRank;
|
||||
int railAllOffset = 0;
|
||||
// Walk the chunk node by node; each pass handles the part of the chunk that
// lies inside one node's [railOneBeg, railOneEnd) range.
while (railAllOffset < railAllSize) {
|
||||
ssize_t railOneBeg = node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
|
||||
int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
|
||||
// Map (node, rail) to a user rank, then to that rank's offset in the input buffer.
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
|
||||
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
|
||||
// Source 0 is the local input buffer; sources 1..nSrcs come from srcPtrs.
// The single destination is dstPtrs[dst].
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
|
||||
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
|
||||
/*PreOpSrcs=*/1>
|
||||
(tid, tn, args->redOpArg, &args->redOpArg, false,
|
||||
/*nSrcs=*/1+nSrcs, [=]__device__(int s) {
|
||||
return s==0 ? (T*)inbuf + userOneBeg
|
||||
: (T*)srcPtrs[s-1] + railAllOffset;
|
||||
},
|
||||
/*nDsts=*/1, [=]__device__(int d/*==0*/) {
|
||||
return (T*)dstPtrs[dst] + railAllOffset;
|
||||
},
|
||||
delta);
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
dst += 1;
|
||||
rail += 1;
|
||||
if (rail == nRails) rail = 0;
|
||||
// Repeats only when ReduceSendNotRecv is true and dst is still below nRails-1.
} while (ReduceSendNotRecv && dst < nRails-1);
|
||||
}
|
||||
};
|
||||
|
||||
// Entry point: assigns the calling thread to one of three phases by its
// thread index and runs that phase's loop over the nNodes*sizePerRank range.
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
int tid = threadIdx.x;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
|
||||
int const &nNodes = ncclShmem.comm.nNodes;
|
||||
ssize_t chunkSize = int(args->chunkCount);
|
||||
ssize_t sizePerRank = args->count;
|
||||
|
||||
// if (direct->out == -1) __trap();
|
||||
bool isMultiRail = (direct->nHeads > 1);
|
||||
// Initial warp weights per phase: 2:2:1 with several heads, 0:1:1 with one.
int nWarps1 = (isMultiRail ? 2 : 0);
|
||||
int nWarps2 = (isMultiRail ? 2 : 1);
|
||||
int nWarps3 = 1;
|
||||
// Scale the weights to args->nWarps; phase 1 takes whatever is left after
// phases 2 and 3 are truncated to integers.
float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
|
||||
nWarps3 = int(denom*nWarps3);
|
||||
nWarps2 = int(denom*nWarps2);
|
||||
nWarps1 = args->nWarps - (nWarps2+nWarps3);
|
||||
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
|
||||
int tn = nWarps1*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 1: Scatter inputs to peers
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
|
||||
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
|
||||
// Each iteration advances railGridOffset by nChannels*chunkSize.
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/true> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/0, /*Send=*/1>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Re-base tid so the next group's threads are numbered from 0.
tid -= tn;
|
||||
|
||||
tn = nWarps2*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 2: Reduce from peers + local input -> send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, direct->heads+1, &direct->out, nullptr, nullptr,
|
||||
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/false> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
return;
|
||||
}
|
||||
tid -= tn;
|
||||
|
||||
tn = nWarps3*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 3: recv from network
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
|
||||
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
// Intersect this channel's chunk [railAllBeg, railAllEnd) with the local
// node's range [railOneBeg, railOneEnd).
ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
// recv() is still called with a count of 0 when the ranges do not overlap.
prims.recv(beg-railOneBeg, max(ssize_t(0), end-beg), /*postOp=*/true);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -85,7 +85,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
|
||||
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
@@ -106,7 +106,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
} while(offset < count && args->reg == 0);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
|
||||
if (isNpKitThread) {
|
||||
@@ -147,7 +147,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
|
||||
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
@@ -168,7 +168,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
} while(offset < count && args->reg == 0);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
|
||||
if (isNpKitThread) {
|
||||
@@ -230,4 +230,4 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
+808
-476
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -22,6 +22,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
int localRanks = comm->topo->nodes[GPU].count;
|
||||
int nChannels = comm->nChannels;
|
||||
|
||||
topoRanks->nvlsHeadNum = 0;
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->ring.prev = channel->ring.next = -1;
|
||||
@@ -33,20 +34,20 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
channel->collnetDirect.headRank = -1;
|
||||
channel->collnetDirect.nHeads = 0;
|
||||
channel->collnetDirect.shift = 0;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
|
||||
|
||||
int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
|
||||
int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
|
||||
int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks;
|
||||
|
||||
for (int i=0; i<localRanks; i++) {
|
||||
if (ringIntra[i] == rank) {
|
||||
topoRanks->ringRecv[c] = ringIntra[0];
|
||||
topoRanks->ringSend[c] = ringIntra[localRanks-1];
|
||||
channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1];
|
||||
channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
|
||||
topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
|
||||
topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
|
||||
}
|
||||
if (treeIntra[i] == rank) {
|
||||
int parentIndex = 0;
|
||||
@@ -64,14 +65,28 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
|
||||
}
|
||||
}
|
||||
topoRanks->ringPrev[c] = channel->ring.prev;
|
||||
topoRanks->ringNext[c] = channel->ring.next;
|
||||
topoRanks->nvlsHeads[c] = nvlsIntra[0];
|
||||
}
|
||||
// Duplicate channels rings/trees
|
||||
// Duplicate channels trees
|
||||
struct ncclChannel* channel0 = comm->channels;
|
||||
struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels;
|
||||
if (channel1) memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
|
||||
|
||||
// Get nvls heads and the number of heads. Duplicate head is not allowed.
|
||||
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
|
||||
bool addHead = true;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
|
||||
|
||||
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
|
||||
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
|
||||
addHead = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (addHead) {
|
||||
topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
|
||||
}
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -203,26 +218,14 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
|
||||
int* send = ringSend+c*comm->nNodes;
|
||||
int* prev = ringPrev+c*comm->nRanks;
|
||||
int* next = ringNext+c*comm->nRanks;
|
||||
struct ncclChannel* channel0 = comm->channels+c;
|
||||
struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels;
|
||||
for (int n=0; n<nNodes; n++) {
|
||||
int recvRank = recv[n];
|
||||
int prevSendRank = send[(n-1+nNodes)%nNodes];
|
||||
prev[recvRank] = prevSendRank;
|
||||
if (comm->rank == recvRank) {
|
||||
channel0->ring.prev = prevSendRank;
|
||||
if (channel1) channel1->ring.prev = prevSendRank;
|
||||
}
|
||||
int sendRank = send[n];
|
||||
int nextRecvRank = recv[(n+1)%nNodes];
|
||||
next[sendRank] = nextRecvRank;
|
||||
if (comm->rank == sendRank) {
|
||||
channel0->ring.next = nextRecvRank;
|
||||
if (channel1) channel1->ring.next = nextRecvRank;
|
||||
}
|
||||
}
|
||||
TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
|
||||
if (channel1) TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
|
||||
}
|
||||
|
||||
// [RCCL] Print off the recv/send local ranks per node, per channel
|
||||
@@ -404,6 +407,15 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
channel->collnetDirect.up[nUp++] = heads[h];
|
||||
sprintf(line+strlen(line), " %d ", heads[h]);
|
||||
}
|
||||
sprintf(line+strlen(line), "heads ");
|
||||
{ // heads[] is the list of heads ordered in head order starting with self
|
||||
int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank;
|
||||
for (int h1=0; h1 < nHeads; h1++) {
|
||||
int h = (h0+h1)%nHeads;
|
||||
channel->collnetDirect.heads[h1] = heads[h];
|
||||
sprintf(line+strlen(line), " %d ", heads[h]);
|
||||
}
|
||||
}
|
||||
channel->collnetDirect.nHeads = nHeads;
|
||||
channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
|
||||
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
|
||||
@@ -412,27 +424,22 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
channel->collnetChain.depth = comm->nRanks/comm->nNodes;
|
||||
}
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
}
|
||||
free(heads);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) {
|
||||
int nHeads = nvlsGraph->nChannels;
|
||||
static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) {
|
||||
int headRank = -1;
|
||||
for (int h=0; h<nHeads; h++) {
|
||||
if (nvlsGraph->intra[h*comm->localRanks] == comm->rank) headRank = h;
|
||||
}
|
||||
|
||||
if (nHeads == 0) {
|
||||
comm->nvlsChannels = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
for (int h = 0; h < nHeads; h++) {
|
||||
if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
|
||||
}
|
||||
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->nvls.nHeads = nHeads;
|
||||
for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
|
||||
@@ -443,8 +450,10 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct nc
|
||||
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
|
||||
channel->nvls.node = comm->node;
|
||||
channel->nvls.nNodes = comm->nNodes;
|
||||
if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
}
|
||||
if (comm->nNodes == 1) return ncclSuccess;
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes == 1 || comm->MNNVL) return ncclSuccess;
|
||||
|
||||
// Connect Trees
|
||||
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
|
||||
@@ -485,7 +494,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct nc
|
||||
}
|
||||
// Set prev/next in all channels (NVLS compute channels work
|
||||
// orthogonally to NVLS search channels).
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
channel->nvls.treeUp = treeUp[c%2];
|
||||
channel->nvls.treeDown[0] = channel->nvls.down;
|
||||
@@ -543,12 +552,19 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev
|
||||
return c;
|
||||
}
|
||||
|
||||
// Swap the two ints that v0 and v1 point to.
// Passing the same address for both arguments leaves the value unchanged.
void exchangeValues(int* v0, int* v1) {
  int tmp = *v1;
  *v1 = *v0;
  *v0 = tmp;
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
|
||||
int nranks = comm->nRanks;
|
||||
int nNodes = comm->nNodes;
|
||||
int nChannels = comm->nChannels;
|
||||
int minHeadNum = INT_MAX;
|
||||
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
|
||||
@@ -557,6 +573,22 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
|
||||
|
||||
// Alternate rings to avoid crossing rails
|
||||
if (graphs[NCCL_ALGO_RING]->crossNic && (comm->nNodes % 2) == 0 && (nChannels % 2) == 0) {
|
||||
for (int r=0; r<comm->nRanks; r++) {
|
||||
if (comm->rankToNode[r] % 2 == 1) {
|
||||
// Exchange rings
|
||||
for (int c=0; c<nChannels; c+=2) {
|
||||
exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
|
||||
exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
|
||||
exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
|
||||
exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int c=0; c<nChannels;c++) {
|
||||
for (int n=0; n<nNodes; n++) {
|
||||
int r = firstRanks[n];
|
||||
@@ -571,17 +603,23 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
|
||||
}
|
||||
}
|
||||
for (int c=0; c<graphs[NCCL_ALGO_NVLS]->nChannels; c++) {
|
||||
for (int n=0; n<nNodes; n++) {
|
||||
|
||||
for (int n = 0; n < nNodes; n++) {
|
||||
int r = firstRanks[n];
|
||||
if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
|
||||
minHeadNum = allTopoRanks[r]->nvlsHeadNum;
|
||||
}
|
||||
|
||||
for (int c = 0; c < minHeadNum; c++) {
|
||||
for (int n = 0; n < nNodes; n++) {
|
||||
int r = firstRanks[n];
|
||||
nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
|
||||
nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
|
||||
}
|
||||
}
|
||||
|
||||
// Connect rings and trees. This should also duplicate the channels.
|
||||
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
|
||||
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
|
||||
NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS]));
|
||||
|
||||
// Only use full MAXCHANNELS for gfx94x
|
||||
int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? MAXCHANNELS : (MAXCHANNELS/2);
|
||||
@@ -595,6 +633,14 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
nc = std::min(maxNchannels/comm->nChannels, nc);
|
||||
nc *= comm->nChannels;
|
||||
|
||||
// Set ring prev/next for my rank
|
||||
for (int c=0; c<nChannels; c++) {
|
||||
struct ncclChannel* channel0 = comm->channels+c;
|
||||
struct ncclChannel* channel1 = channel0+nChannels;
|
||||
channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
|
||||
channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
|
||||
}
|
||||
|
||||
// Duplication should be complete now
|
||||
nChannels = comm->nChannels = std::min(maxChannels, (nChannels <= maxChannels/2) ? nChannels*2 : nChannels);
|
||||
|
||||
@@ -633,6 +679,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(minNchannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
|
||||
}
|
||||
|
||||
comm->collChannels = comm->nChannels;
|
||||
// Support maximal channel usage for aggregation
|
||||
if (comm->nChannels < comm->nvlsChannels) {
|
||||
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
|
||||
}
|
||||
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
|
||||
|
||||
// Create rings array and check all is fine
|
||||
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
|
||||
|
||||
@@ -646,4 +699,4 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
free(nvlsHeads);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
@@ -349,6 +349,23 @@ compare:
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// MNNVL: Check whether peers are in the same fabric cluster and clique
|
||||
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret) {
|
||||
*ret = 0;
|
||||
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo;
|
||||
// A zero UUID means we don't have MNNVL fabric info
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
|
||||
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
|
||||
*ret = 1;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
||||
int ncclTopoUserGdrLevel = -1;
|
||||
|
||||
@@ -779,7 +796,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
}
|
||||
|
||||
comm->localRanks = system->nodes[GPU].count;
|
||||
if (system->nodes[GPU].count == comm->nRanks && remove) {
|
||||
if ((system->nodes[GPU].count == comm->nRanks && remove) || comm->MNNVL) {
|
||||
for (int n=system->nodes[NET].count-1; n>=0; n--)
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
}
|
||||
@@ -794,11 +811,12 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
|
||||
free(system);
|
||||
}
|
||||
|
||||
NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
|
||||
NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1);
|
||||
NCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", -2);
|
||||
|
||||
static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
|
||||
static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) {
|
||||
int peer;
|
||||
struct ncclTopoSystem* system = comm->topo;
|
||||
struct ncclTopoLinkList* path = NULL;
|
||||
if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) {
|
||||
// Same rank
|
||||
@@ -814,9 +832,28 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
|
||||
} else {
|
||||
*nChannels = 2;
|
||||
}
|
||||
} else if (comm->MNNVL) {
|
||||
// MNNVL assume all GPUs are connected via NVLink
|
||||
path = system->nodes[GPU].nodes[g].paths[GPU]+((g+1)%system->nodes[GPU].count);
|
||||
float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
|
||||
*nChannels = 2*std::max(1, (int)(path->bw / nvlBw));
|
||||
} else {
|
||||
// Remote rank, use network
|
||||
*nChannels = ncclParamNChannelsPerNetPeer();
|
||||
int nNetChannels = ncclParamNChannelsPerNetPeer();
|
||||
if (nNetChannels == -1) {
|
||||
//start from 2 channels per NIC and reduce with scale
|
||||
nNetChannels = 2;
|
||||
|
||||
// check if we need to use more than one NIC, hence more than one channel
|
||||
int netCountByBw = 1, nChannelsMax = nNetChannels;
|
||||
NCCLCHECK(getLocalNetCountByBw(system, g, &netCountByBw));
|
||||
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
|
||||
while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
|
||||
|
||||
// Allow up to the number of channels required to drive the NICs
|
||||
nNetChannels = std::max(netCountByBw, nChannelsMax);
|
||||
}
|
||||
*nChannels = nNetChannels;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -845,7 +882,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
|
||||
for (int r=0; r<comm->nRanks; r++) {
|
||||
int nChannels;
|
||||
NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels));
|
||||
NCCLCHECK(ncclTopoGetNchannels(comm, g, r, &nChannels));
|
||||
if (nChannels >= 0) minChannels = std::min(minChannels, nChannels);
|
||||
}
|
||||
}
|
||||
@@ -907,4 +944,4 @@ int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
|
||||
}
|
||||
}
|
||||
return minPath >= PATH_PIX ? 0 : 1;
|
||||
}
|
||||
}
|
||||
@@ -404,13 +404,12 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
return ncclSuccess;
|
||||
}
|
||||
// 2. Try to get better bandwidth
|
||||
// Give a 15% perf bonus to paths not crossing nics
|
||||
float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
|
||||
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) {
|
||||
// Give a 5% perf bonus to paths not crossing nics
|
||||
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
|
||||
*copy = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess;
|
||||
if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;
|
||||
|
||||
// 3. Less hops
|
||||
if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
|
||||
@@ -520,6 +519,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
|
||||
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
|
||||
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
|
||||
|
||||
// Balanced Tree : count half of the bandwidth on first two GPUs
|
||||
int nextBackToNet = -1;
|
||||
@@ -591,6 +591,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->collNet && net->net.collSupport == 0) continue;
|
||||
if (net->net.bw < bw) continue;
|
||||
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
|
||||
|
||||
graph->inter[graph->nChannels*2] = net->id;
|
||||
graph->latencyInter = net->net.latency;
|
||||
@@ -1180,16 +1181,29 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
|
||||
|
||||
#include "comm.h"
|
||||
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
|
||||
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) {
|
||||
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int* dev) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int localRanks = comm->topo->nodes[GPU].count;
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
if (graph->intra[c*localRanks] == comm->rank) {
|
||||
*dev = graph->inter[c*2];
|
||||
return ncclSuccess;
|
||||
int netNum = 0;
|
||||
int net[MAXCHANNELS];
|
||||
|
||||
for (int c = 0; c < graph->nChannels; c++) {
|
||||
if (graph->intra[c * localRanks] == comm->rank) {
|
||||
net[netNum++] = graph->inter[c * 2];
|
||||
}
|
||||
}
|
||||
if (netNum) {
|
||||
*dev = net[channelId % netNum];
|
||||
} else {
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
|
||||
return ncclInternalError;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
|
||||
@@ -1204,7 +1218,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
|
||||
if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
|
||||
*dev = graph->inter[channel*2+index];
|
||||
} else {
|
||||
NCCLCHECK(getNvlsNetDev(comm, graph, dev));
|
||||
NCCLCHECK(getNvlsNetDev(comm, graph, channelId, dev));
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
|
||||
} else if (peerRank == -1) {
|
||||
|
||||
@@ -186,12 +186,17 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
|
||||
// even though they're supposed to sustain full BW across all ports.
|
||||
// Flatten the switch as this extra level can break the search and make
|
||||
// NCCL take wrong topology decisions.
|
||||
// Classify a PCI device id for ncclTopoFlattenBcmSwitches.
//
// The low 12 bits of `id` are ignored. Returns 4 when the remaining bits equal
// 0x1000c0101000a000 (independent of `level`), 5 when they equal
// 0x1000c03010000000 with `level*0x1000` OR-ed in, and 0 otherwise.
// The visible callers pass level 0 for a switch and level 1 for its sub switches.
int getBcmGen(uint64_t id, int level) {
  const uint64_t idMask = 0xfffffffffffff000ULL;
  const uint64_t masked = id & idMask;
  if (masked == 0x1000c0101000a000ULL) return 4;
  if (masked == (0x1000c03010000000ULL | (uint64_t)(level*0x1000))) return 5;
  return 0;
}
|
||||
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
for (int s=0; s<system->nodes[PCI].count; s++) {
|
||||
struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
|
||||
uint64_t device = pciSwitch->pci.device;
|
||||
// Only flatten PEX Gen 4 switches in base mode
|
||||
if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
|
||||
int gen = getBcmGen(pciSwitch->pci.device, 0);
|
||||
// Flatten Gen4 PEX switches in base mode
|
||||
if (gen) {
|
||||
// Find sub switches with the same device ID.
|
||||
int64_t* subSwIds;
|
||||
NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
|
||||
@@ -199,7 +204,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
for (int l=0; l<pciSwitch->nlinks; l++) {
|
||||
struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
|
||||
// Only fuse sub switches with the same device ID.
|
||||
if (sub->type != PCI || sub->pci.device != device) continue;
|
||||
if (sub->type != PCI || getBcmGen(sub->pci.device, 1) != gen) continue;
|
||||
// Save sub switch for later
|
||||
subSwIds[subs++] = sub->id;
|
||||
// Remove link to that sub switch
|
||||
@@ -231,8 +236,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
}
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
|
||||
}
|
||||
// Set subdevice to 0x0000 to make sure we don't merge this switch again.
|
||||
pciSwitch->pci.device = 0x1000c01010000000;
|
||||
// Set subdevice to 0xffff to make sure we don't merge this switch again.
|
||||
pciSwitch->pci.device |= 0xffff;
|
||||
free(subSwIds);
|
||||
// Restart, as system->nodes[PCI].nodes has changed.
|
||||
s = 0;
|
||||
@@ -816,6 +821,30 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Count how many of the NETs local to GPU index `gpu` are needed before their
// summed path bandwidth reaches the GPU's bandwidth to the CPU.
//
// *count receives that number. It is 0 when the GPU has no CPU link (gpuBw
// stays 0) or when ncclTopoGetLocal reports no local NET.
// Returns ncclSuccess, or the error propagated from ncclTopoGetLocal.
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) {
  struct ncclTopoNode* gpuNode = system->nodes[GPU].nodes+gpu;
  int localNetCount = 0, netCountByBw = 0;
  int* localNets = NULL;
  float totalNetBw = 0, gpuBw = 0;

  for (int l=0; l<gpuNode->nlinks; l++) {
    // Assuming BW to CPU reflects the GPU bandwidth via P2P or C2C.
    // Caveat: this could be wrong if there is a PCIe switch
    // and a narrower link to the CPU.
    if (gpuNode->links[l].remNode->type == CPU) {
      gpuBw = gpuNode->links[l].bw;
    }
  }

  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
  // Add local NETs in the order returned until their total bandwidth covers gpuBw.
  for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) {
    totalNetBw += gpuNode->paths[NET][localNets[l]].bw;
  }
  *count = netCountByBw;

  free(localNets);
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
|
||||
int gpu;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
||||
@@ -845,17 +874,25 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
|
||||
}
|
||||
|
||||
// Find a GPU whose local NET, for some channel, is the NET with id `net`.
//
// Only the GPUs that ncclTopoGetLocal reports as local to that NET are tried.
// Channels 0..MAXCHANNELS-1 are scanned in order, and within each channel the
// local GPUs in the order returned; the first GPU for which
// ncclTopoGetLocalNet yields `net` wins.
// *gpuIndex receives that GPU's index, or -1 when none matches (the function
// still returns ncclSuccess in that case). Errors from the helpers are returned.
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
  int netIndex;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex));
  int* localGpus = NULL;
  int localGpuCount;
  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));

  int found = -1;
  for (int c=0; c<MAXCHANNELS && found == -1; c++) {
    for (int lg=0; lg<localGpuCount; lg++) {
      int g = localGpus[lg];
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      int id;
      // Checked by hand rather than with NCCLCHECK so that localGpus is
      // released before the error is propagated.
      ncclResult_t res = ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id);
      if (res != ncclSuccess) {
        free(localGpus);
        return res;
      }
      if (net == id) {
        found = g;
        break;
      }
    }
  }

  free(localGpus);
  *gpuIndex = found;
  return ncclSuccess;
}
|
||||
@@ -963,4 +1000,4 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
|
||||
if (ccMin) *ccMin = min;
|
||||
if (ccMax) *ccMax = max;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
@@ -17,7 +17,7 @@
|
||||
#define SM60_NVLINK_BW 18.0
|
||||
#define SM70_NVLINK_BW 20.0
|
||||
#define SM80_NVLINK_BW 20.0
|
||||
#define SM90_NVLINK_BW 20.0
|
||||
#define SM90_NVLINK_BW 20.6
|
||||
#define SM86_NVLINK_BW 12.0
|
||||
#define PCI_BW 12.0 // PCI Gen3 x16
|
||||
#define QPI_BW 6.0
|
||||
@@ -246,6 +246,17 @@ static float ncclTopoXGMISpeed(const char* gcn) {
|
||||
#define ncclGetKernelIndex(p_comm) (0)
|
||||
#endif
|
||||
|
||||
// Returns NVLink bw in GB/s
|
||||
static float ncclTopoNVLinkBw(int cudaCompCap) {
|
||||
return
|
||||
cudaCompCap >= 90 ? SM90_NVLINK_BW :
|
||||
cudaCompCap == 86 ? SM86_NVLINK_BW :
|
||||
cudaCompCap >= 80 ? SM80_NVLINK_BW :
|
||||
cudaCompCap >= 70 ? SM70_NVLINK_BW :
|
||||
cudaCompCap >= 60 ? SM60_NVLINK_BW :
|
||||
SM80_NVLINK_BW;
|
||||
}
|
||||
|
||||
// Mirror bits
|
||||
static bool isPow2(int val) {
|
||||
return (val & (val-1)) == 0;
|
||||
|
||||
@@ -325,7 +325,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
||||
#endif
|
||||
|
||||
int nNodes = comm->nNodes;
|
||||
// MNNVL support - treat as a single NVLink connected node
|
||||
int nNodes = comm->MNNVL ? 1 : comm->nNodes;
|
||||
int nRanks = comm->nRanks;
|
||||
if (nRanks <= 1) return ncclSuccess;
|
||||
|
||||
@@ -358,8 +359,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
|
||||
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
|
||||
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
|
||||
@@ -388,20 +389,39 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
|
||||
// Collnet+Direct requires all GPUs to have a local NIC to work at full speed
|
||||
float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
|
||||
factor -= (factor-1)/2;
|
||||
busBw /= factor;
|
||||
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
|
||||
busBw = ppn * bw;
|
||||
// AllGather/ReduceScatter requires 1:1 GPU:NIC
|
||||
int nicPerNode = comm->collNetHeadsUniqueNum;
|
||||
if (coll == ncclFuncAllGather && comm->nNodes > 1) {
|
||||
if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0;
|
||||
}
|
||||
if (coll == ncclFuncReduceScatter && comm->nNodes > 1) {
|
||||
if (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter || ppn > nicPerNode) busBw = 0;
|
||||
}
|
||||
// Measured corrective ratio needed at 1 ppn and 8ppn. Here we hackishly
|
||||
// interpolate the two.
|
||||
float w = (ppn-1)/(8-1);
|
||||
busBw *= w*0.85 + (1-w)*0.95;
|
||||
} else {
|
||||
// Collnet+Direct requires all GPUs to have a local NIC to work at full speed
|
||||
float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
|
||||
factor -= (factor-1)/2;
|
||||
busBw /= factor;
|
||||
if (minCompCap >= 90) busBw *= .85;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
float ratio;
|
||||
if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
|
||||
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio = 5.0/6.0;
|
||||
else ratio = .5;
|
||||
comm->bandwidths[coll][a][p] = busBw * ratio;
|
||||
if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
|
||||
float ratio = 1.0f;
|
||||
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
|
||||
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
|
||||
else ratio *= .5;
|
||||
busBw *= ratio;
|
||||
}
|
||||
comm->bandwidths[coll][a][p] = busBw;
|
||||
/* Ring bandwidth backup */
|
||||
if (a == NCCL_ALGO_RING)
|
||||
comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];
|
||||
@@ -464,18 +484,19 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||
}
|
||||
|
||||
if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes == 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
|
||||
|
||||
// Disable CollNet if it is not supported
|
||||
if (comm->collNetSupport == 0) {
|
||||
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
||||
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
|
||||
if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes > 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||
// If user has hard set NCCL_ALGO=COLLNET, ignore it
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
|
||||
algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
|
||||
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
|
||||
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
|
||||
}
|
||||
} else {
|
||||
// Disable CollNet+Direct if not on an NVSwitch system
|
||||
@@ -611,9 +632,9 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
|
||||
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
|
||||
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
|
||||
float lat = info->comm->latencies[info->coll][algorithm][protocol];
|
||||
|
||||
|
||||
if (backup) {
|
||||
*backup = false;
|
||||
if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
|
||||
@@ -640,7 +661,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
#else
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && (!info->comm->MNNVL && info->comm->nNodes > 1)
|
||||
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
|
||||
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
|
||||
}
|
||||
@@ -649,4 +670,4 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
|
||||
*time = lat * latCount + (info->nBytes) / (1000 * bw);
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
@@ -155,7 +155,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
}
|
||||
|
||||
while (true) { // Iterate rounds of launches for clique.
|
||||
bool moreRounds;
|
||||
bool moreRounds = false;
|
||||
comm = cliqueHead;
|
||||
do { // Iterate clique members.
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
@@ -163,7 +163,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
// Barrier reduction result tells us if this was the final round.
|
||||
moreRounds = 0 != ncclCommIntraBarrierOut(comm);
|
||||
} else {
|
||||
moreRounds = comm->unlaunchedPlansHead != nullptr;
|
||||
moreRounds |= comm->unlaunchedPlansHead != nullptr;
|
||||
}
|
||||
if (moreRounds) {
|
||||
// Pop next unlaunched kernel
|
||||
@@ -248,9 +248,9 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
|
||||
// Reset comm->tasks to empty.
|
||||
comm->tasks.nTasksColl = 0;
|
||||
comm->tasks.nTasksP2p = 0;
|
||||
comm->tasks.workBytesTotal = 0;
|
||||
comm->tasks.streams = nullptr;
|
||||
ncclIntruQueueConstruct(&comm->tasks.collQueue);
|
||||
comm->tasks.collBytesTotal = 0;
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
|
||||
@@ -334,9 +334,9 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
|
||||
assert(state == ncclGroupJobJoined);
|
||||
}
|
||||
|
||||
if (*groupAbortFlag == true || errorJobAbortFlag == true) {
|
||||
*job->abortFlag = 1;
|
||||
if (job->childAbortFlag) *job->childAbortFlag = 1;
|
||||
if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) {
|
||||
__atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED);
|
||||
if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
job = job->next;
|
||||
@@ -455,7 +455,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
|
||||
|
||||
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
|
||||
if (groupJob && groupJob->initialized) {
|
||||
*groupJob->abortFlagPtr = true;
|
||||
__atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED);
|
||||
NCCLCHECK(ncclGroupJobComplete(groupJob));
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -109,13 +109,14 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
|
||||
CUmemAllocationProp prop = {};
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
CUmemGenericAllocationHandle handle;
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
|
||||
prop.requestedHandleTypes = type;
|
||||
prop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
|
||||
|
||||
@@ -12,5 +12,6 @@
|
||||
|
||||
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
|
||||
ncclResult_t ArgsCheck(struct ncclInfo* info);
|
||||
ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -19,9 +19,9 @@ static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNet
|
||||
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
/* DMA-BUF support */
|
||||
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
|
||||
@@ -53,4 +53,15 @@ inline int ncclTypeSize(ncclDataType_t type) {
|
||||
}
|
||||
}
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
#define NCCL_MODE_NORMAL 0
|
||||
#define NCCL_MODE_OFFSET 1
|
||||
#define NCCL_MODE_PTR 2
|
||||
struct ncclConnFifo { // FIFO entry type; used elsewhere in this change as connFifo[NCCL_STEPS] and as the GPU - Proxy communication FIFO
|
||||
int mode; // presumably one of NCCL_MODE_NORMAL / NCCL_MODE_OFFSET / NCCL_MODE_PTR defined just above -- TODO confirm against users
|
||||
int offset; // NOTE(review): looks like the value that matters in NCCL_MODE_OFFSET -- confirm
|
||||
ssize_t size; // signed size (ssize_t comes from <sys/types.h>, included above)
|
||||
void* ptr; // NOTE(review): looks like the value that matters in NCCL_MODE_PTR -- confirm
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "proxy.h"
|
||||
#include "strongstream.h"
|
||||
#include "nccl_net.h"
|
||||
#include "register.h"
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define HIPRT_CB
|
||||
@@ -59,6 +60,7 @@ struct ncclRecvMem {
|
||||
struct {
|
||||
uint64_t tail;
|
||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
struct ncclConnFifo connFifo[NCCL_STEPS];
|
||||
int sizesFifo[NCCL_STEPS];
|
||||
int offsFifo[NCCL_STEPS];
|
||||
int flush; // For GDRCopy-based flush
|
||||
@@ -174,7 +176,6 @@ struct ncclKernelPlan {
|
||||
// A kernel plan is also a callback that reclaims itself. Hence this must
|
||||
// be the first member.
|
||||
struct ncclCommCallback reclaimer;
|
||||
struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup
|
||||
|
||||
struct ncclComm* comm;
|
||||
struct ncclKernelPlan* next;
|
||||
@@ -205,23 +206,7 @@ struct ncclKernelPlan {
|
||||
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
|
||||
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
|
||||
} channels[MAXCHANNELS];
|
||||
};
|
||||
|
||||
struct ncclRegRequest {
|
||||
uintptr_t buff;
|
||||
size_t size;
|
||||
struct ncclRegRequest *next;
|
||||
};
|
||||
|
||||
struct ncclRegRecord {
|
||||
uintptr_t buff;
|
||||
size_t size;
|
||||
CUdeviceptr regAddr;
|
||||
size_t regSize;
|
||||
int dev;
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */
|
||||
struct ncclRegRecord *next;
|
||||
size_t maxBytesPerChannel;
|
||||
};
|
||||
|
||||
struct ncclComm {
|
||||
@@ -268,6 +253,7 @@ struct ncclComm {
|
||||
int* localRankToRank;
|
||||
// localRanks and localRanktoRank for all nodes
|
||||
struct ncclNodeRanks* nodeRanks;
|
||||
int MNNVL; // MNNVL: Multi-Node NVLink
|
||||
|
||||
bool checkPointers;
|
||||
bool dmaBufSupport;
|
||||
@@ -276,8 +262,9 @@ struct ncclComm {
|
||||
uint64_t opCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
int nvlsChannels;
|
||||
int nChannels; // connection nChannels
|
||||
int collChannels; // enqueue nChannels
|
||||
int nvlsChannels; // enqueue nChannels
|
||||
int collNetChannels;
|
||||
// Channels (per peer) for p2p
|
||||
int p2pnChannels;
|
||||
@@ -345,6 +332,9 @@ struct ncclComm {
|
||||
int intraHighestTransportType;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
int collNetHeadsUniqueNum;
|
||||
int* collNetDenseToUserRank;
|
||||
int* collNetUserToDenseRank;
|
||||
/* sharable collNet proxy progress resource. */
|
||||
struct ncclCollNetSharedRes* collNetSharedRes;
|
||||
|
||||
@@ -354,8 +344,6 @@ struct ncclComm {
|
||||
/* sharable NVLS resource. */
|
||||
struct ncclNvlsSharedRes* nvlsResources;
|
||||
|
||||
ssize_t channelSize; // User requested work size (bytes) for channel partitions
|
||||
|
||||
// pools backed by comm->memPermanent
|
||||
struct ncclMemoryPool memPool_ncclProxyOp;
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
@@ -406,13 +394,10 @@ struct ncclComm {
|
||||
// group job to support multi-thread FT
|
||||
struct ncclGroupJob *groupJob;
|
||||
|
||||
/* store to buffer register request */
|
||||
struct ncclIntruQueue<struct ncclRegRequest, &ncclRegRequest::next> regRequestQueue;
|
||||
/* store registered buffer */
|
||||
struct ncclIntruQueue<struct ncclRegRecord, &ncclRegRecord::next> regRecordQueue;
|
||||
|
||||
// Tuning plugin
|
||||
ncclTuner_t* tuner;
|
||||
// buffer registration cache
|
||||
struct ncclRegCache regCache;
|
||||
};
|
||||
|
||||
enum ncclLaunchMode {
|
||||
@@ -496,4 +481,4 @@ static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) {
|
||||
ncclResult_t ncclCommEnsureReady(ncclComm_t comm);
|
||||
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -16,6 +16,10 @@ extern int ncclCuMemEnable();
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cudaTypedefs.h>
|
||||
|
||||
// Handle type used for cuMemCreate()
|
||||
extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
|
||||
#else
|
||||
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
|
||||
|
||||
@@ -106,8 +106,7 @@ struct ncclConnInfo {
|
||||
void **ptrExchange; // Pointer exchange for direct communication
|
||||
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
|
||||
|
||||
int *sizesFifo; // Sizes fifo from GPU to proxy
|
||||
int *offsFifo; // Buffer fifo from proxy to GPU
|
||||
struct ncclConnFifo* connFifo; // Used for GPU - Proxy communication
|
||||
|
||||
uint64_t step; // Keep where we are
|
||||
uint64_t llLastCleaning;
|
||||
@@ -167,6 +166,9 @@ struct ncclDirect {
|
||||
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
|
||||
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
|
||||
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
|
||||
// The heads[...] are guaranteed to be in rotated order starting with self:
|
||||
// headRank, (headRank+1)%nHeads, (headRank+2)%nHeads, ...
|
||||
int heads[NCCL_MAX_DIRECT_ARITY+1];
|
||||
int up[NCCL_MAX_DIRECT_ARITY];
|
||||
int down[NCCL_MAX_DIRECT_ARITY];
|
||||
};
|
||||
@@ -229,30 +231,32 @@ struct ncclWorkElem {
|
||||
union {
|
||||
uint8_t flagBits;
|
||||
struct {
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, nWarps:5;
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, oneNode:1;
|
||||
};
|
||||
};
|
||||
uint8_t nWarps;
|
||||
uint8_t direct;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
struct {
|
||||
uint32_t root:28;
|
||||
uint32_t pad_0:2;
|
||||
uint32_t connIndex:2;
|
||||
};
|
||||
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
uint32_t root:30, connIndex:2;
|
||||
const void *sendbuff;
|
||||
void *recvbuff;
|
||||
|
||||
size_t count;
|
||||
union {
|
||||
size_t lastChunkSize;
|
||||
// Pivot A2A kernel computes chunk size itself.
|
||||
// Instead, it needs the number of bidirectional rings.
|
||||
size_t pivotA2ANumBiRings;
|
||||
};
|
||||
uint64_t count:39, opCount:25;
|
||||
uint64_t redOpArg;
|
||||
uint64_t opCount;
|
||||
uint64_t chunkCount:25, workCount:39;
|
||||
union {
|
||||
struct {
|
||||
uint64_t lastChunkCount:25;
|
||||
uint64_t workOffset:39;
|
||||
};
|
||||
struct {
|
||||
uint32_t nChannels;
|
||||
uint16_t bid;
|
||||
// Pivot A2A kernel computes chunk size itself.
|
||||
// Instead, it needs the number of bidirectional rings.
|
||||
uint16_t pivotA2ANumBiRings;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
|
||||
@@ -265,15 +269,16 @@ struct ncclWorkElemP2p {
|
||||
int32_t proto:2;
|
||||
};
|
||||
union {
|
||||
uint16_t flagBits;
|
||||
uint16_t flagBit;
|
||||
struct {
|
||||
enum ncclWorkP2PType p2pType:4;
|
||||
uint16_t nWarps:4;
|
||||
uint16_t warpStart:4;
|
||||
uint16_t ngroups:4;
|
||||
uint8_t nWarps:4;
|
||||
uint8_t warpStart:4;
|
||||
uint8_t ngroups:4;
|
||||
};
|
||||
};
|
||||
uint16_t opCount;
|
||||
uint8_t reg:1;
|
||||
uint16_t opCount:12;
|
||||
// Important not to use any fields with greater than 4-byte alignment since
|
||||
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
|
||||
// there were 8-byte fields.
|
||||
@@ -398,6 +403,8 @@ struct alignas(16) ncclDevChannel {
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
int node;
|
||||
int nNodes;
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
int p2pChunkSize;
|
||||
|
||||
@@ -405,6 +412,8 @@ struct ncclDevComm {
|
||||
int workFifoDepth;
|
||||
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
|
||||
|
||||
int* collNetDenseToUserRank;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t* abortFlag;
|
||||
|
||||
@@ -526,56 +535,55 @@ extern int const ncclDevFuncRowToId[];
|
||||
// `ncclFuncIndex()` needs to be in sync with 'ALL_COLLS' in Generate.cmake
|
||||
inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) {
|
||||
int row = 0;
|
||||
do {
|
||||
// RING / <all_protos> / Sum / int8_t
|
||||
if (coll == ncclFuncAllGather) {
|
||||
row += proto;
|
||||
break;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS;
|
||||
|
||||
// RING / <all_protos> / Sum / int8_t
|
||||
if (coll == ncclFuncAllGather) {
|
||||
row += proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS;
|
||||
// <all_algos> / <all_protos> / <all_redops> / <all_types>
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
row += (((algo * NCCL_NUM_PROTOCOLS + proto) * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * (algo * NCCL_NUM_PROTOCOLS + proto);
|
||||
break;
|
||||
}
|
||||
row += (NCCL_NUM_ALGORITHMS - 2) * NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);
|
||||
|
||||
// <all_algos> / <all_protos> / <all_redops> / <all_types>
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
row += (((algo * NCCL_NUM_PROTOCOLS + proto) * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - /*floats for each SumPostDiv*/ 6 * (algo * NCCL_NUM_PROTOCOLS + proto);
|
||||
goto have_row;
|
||||
}
|
||||
row += (NCCL_NUM_ALGORITHMS - 2) * NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - /*floats for each SumPostDiv*/ 6);
|
||||
// RING / SIMPLE / Sum / int8_t
|
||||
if (coll == ncclFuncAllToAllPivot) break;
|
||||
row += 1;
|
||||
|
||||
// RING / SIMPLE / Sum / int8_t
|
||||
if (coll == ncclFuncAllToAllPivot) goto have_row;
|
||||
row += 1;
|
||||
// RING / <all_protos> / Sum / int8_t
|
||||
if (coll == ncclFuncBroadcast) {
|
||||
row += proto;
|
||||
break;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS;
|
||||
|
||||
// RING / <all_protos> / Sum / int8_t
|
||||
if (coll == ncclFuncBroadcast) {
|
||||
row += proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS;
|
||||
// RING / <all_protos> / <all_redops> / <all_types>
|
||||
if (coll == ncclFuncReduce) {
|
||||
row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto;
|
||||
break;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);
|
||||
|
||||
// RING / <all_protos> / <all_redops> / <all_types>
|
||||
if (coll == ncclFuncReduce) {
|
||||
row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - /*floats for each SumPostDiv*/ 6 * proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - /*floats for each SumPostDiv*/ 6);
|
||||
// RING / <all_protos> / <all_redops> / <all_types>
|
||||
if (coll == ncclFuncReduceScatter) {
|
||||
row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto;
|
||||
break;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);
|
||||
|
||||
// RING / <all_protos> / <all_redops> / <all_types>
|
||||
if (coll == ncclFuncReduceScatter) {
|
||||
row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - /*floats for each SumPostDiv*/ 6 * proto;
|
||||
goto have_row;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - /*floats for each SumPostDiv*/ 6);
|
||||
// RING / SIMPLE / Sum / int8_t
|
||||
if (coll == ncclFuncSendRecv) break;
|
||||
row += 1;
|
||||
|
||||
// RING / SIMPLE / Sum / int8_t
|
||||
if (coll == ncclFuncSendRecv) goto have_row;
|
||||
row += 1;
|
||||
} while (false);
|
||||
|
||||
have_row:
|
||||
return ncclDevFuncRowToId[row];
|
||||
}
|
||||
|
||||
inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[FUNC_INDEX_P2P]; }
|
||||
inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[FUNC_INDEX_TOTAL - NCCL_NUM_ONERANK - 1]; }
|
||||
|
||||
inline int ncclDevFuncId_AllToAllPivot() { return ncclDevFuncRowToId[FUNC_INDEX_ALLTOALL_PIVOT]; }
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -12,8 +12,10 @@
|
||||
#include "collectives.h"
|
||||
#include "utils.h"
|
||||
|
||||
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
|
||||
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
|
||||
#define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t)
|
||||
#define NCCL_LL128_ALIGNMENT_PER_WARP 480
|
||||
#define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
|
||||
#define NCCL_BYTES_ALIGNMENT 16
|
||||
|
||||
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
|
||||
@@ -34,6 +34,7 @@ int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
#define MAX_XGMI_INTER_GPUS 4
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
|
||||
@@ -59,10 +60,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
|
||||
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
|
||||
@@ -111,6 +113,7 @@ struct ncclTopoRanks {
|
||||
int treeToChild0[MAXCHANNELS];
|
||||
int treeToChild1[MAXCHANNELS];
|
||||
int nvlsHeads[MAXCHANNELS];
|
||||
int nvlsHeadNum;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "strongstream.h"
|
||||
#define NCCL_MAX_LOCAL_RANKS 64
|
||||
|
||||
typedef enum : uint8_t {
|
||||
ncclPatternRing,
|
||||
@@ -31,6 +32,13 @@ typedef enum : uint8_t {
|
||||
ncclPatternRecv
|
||||
} ncclPattern_t;
|
||||
|
||||
enum ncclRegBufferType { // Type of the ncclInfo::regBufType field declared further down
|
||||
NCCL_REGULAR_BUFFER = 0, // presumably: no registered-buffer path is used -- TODO confirm
|
||||
NCCL_IPC_REG_BUFFER = 1, // presumably: buffer registered for the IPC path -- TODO confirm
|
||||
NCCL_NVLS_REG_BUFFER = 2, // presumably: buffer registered for the NVLS path -- TODO confirm
|
||||
NCCL_REG_BUFFER_NUM = 3 // equals the number of buffer types listed above
|
||||
};
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
struct ncclInfo {
|
||||
ncclFunc_t coll;
|
||||
@@ -49,37 +57,46 @@ struct ncclInfo {
|
||||
int sliceSteps;
|
||||
// Computed later
|
||||
ncclDevRedOpFull opFull;
|
||||
int algorithm;
|
||||
int protocol;
|
||||
ncclPattern_t pattern;
|
||||
int nChannels;
|
||||
int nThreads;
|
||||
size_t nBytes;
|
||||
size_t aggnBytes;
|
||||
size_t workBytes;
|
||||
size_t sendbuffSize;
|
||||
size_t recvbuffSize;
|
||||
int nstepsPerLoop;
|
||||
int nchunksPerLoop;
|
||||
int stepSize;
|
||||
int chunkCount;
|
||||
int chunkSize;
|
||||
int channelId;
|
||||
int workFuncIndex;
|
||||
ncclRegBufferType regBufType;
|
||||
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
|
||||
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
|
||||
// Need to initialize
|
||||
int nThreads;
|
||||
int nChannels;
|
||||
int algorithm;
|
||||
int protocol;
|
||||
bool userTuned;
|
||||
struct ncclInfo *next;
|
||||
};
|
||||
|
||||
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
|
||||
info->nBytes = info->count * ncclTypeSize(info->datatype);
|
||||
info->nBytes = info->workBytes = info->count * ncclTypeSize(info->datatype);
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAllPivot) {
|
||||
info->count = info->nBytes;
|
||||
info->count = info->workBytes;
|
||||
info->datatype = ncclInt8;
|
||||
}
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank
|
||||
|
||||
/* compute buffer size for NVLS buffer registration */
|
||||
if (info->coll == ncclFuncAllGather) {
|
||||
info->sendbuffSize = info->count * ncclTypeSize(info->datatype);
|
||||
info->sendbuffSize = info->workBytes;
|
||||
info->recvbuffSize = info->sendbuffSize * nRanks;
|
||||
} else if (info->coll == ncclFuncReduceScatter) {
|
||||
info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
|
||||
info->recvbuffSize = info->workBytes;
|
||||
info->sendbuffSize = info->recvbuffSize * nRanks;
|
||||
} else {
|
||||
info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
|
||||
info->sendbuffSize = info->recvbuffSize = info->workBytes;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -94,6 +111,7 @@ struct ncclTaskColl {
|
||||
ncclDataType_t datatype;
|
||||
ncclDevRedOpFull op;
|
||||
int chunkSteps, sliceSteps;
|
||||
struct ncclInfo info;
|
||||
};
|
||||
struct ncclTaskP2p {
|
||||
ncclTaskP2p *next;
|
||||
@@ -114,8 +132,16 @@ struct ncclTasks {
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
|
||||
};
|
||||
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
|
||||
size_t collBytesTotal;
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collQueue;
|
||||
// Queue for user-tuned executed collectives
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collTunedQueue;
|
||||
// Queue for continuous bytes distribution (CBD) collectives
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collCBDQueue;
|
||||
// Queue for collnet
|
||||
struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collnetQueue;
|
||||
size_t workBytesTotal;
|
||||
int usableChannels;
|
||||
bool sorted;
|
||||
struct Peer* peers/*[nRanks]*/;
|
||||
int *p2pSendOrder, *p2pRecvOrder;
|
||||
int p2pOrderSteps;
|
||||
@@ -134,4 +160,4 @@ struct ncclTasks {
|
||||
struct ncclCudaGraph capturingGraph;
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -35,4 +35,7 @@ ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
|
||||
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
|
||||
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
|
||||
|
||||
ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash);
|
||||
ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd);
|
||||
|
||||
#endif /* NCCL_IPCSOCKET_H */
|
||||
|
||||
@@ -12,12 +12,22 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv and AllToAllPivot not included for now
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
|
||||
#define NCCL_NUM_ONERANK 12
|
||||
#define FUNC_INDEX_TOTAL 980 + NCCL_NUM_ONERANK
|
||||
|
||||
#define FUNC_INDEX_P2P 979
|
||||
#define FUNC_INDEX_ALLTOALL_PIVOT 651
|
||||
#define FUNC_INDEX_TOTAL 992
|
||||
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
|
||||
typedef enum { // Function identifiers, each with an explicit numeric value
|
||||
ncclFuncBroadcast = 0,
|
||||
ncclFuncReduce = 1,
|
||||
ncclFuncAllGather = 2,
|
||||
ncclFuncReduceScatter = 3,
|
||||
ncclFuncAllReduce = 4, // values 0..4 are the ones below NCCL_NUM_FUNCTIONS (5)
|
||||
ncclFuncSendRecv = 5, // Send/Recv are not counted in NCCL_NUM_FUNCTIONS (see its comment)
|
||||
ncclFuncSend = 6,
|
||||
ncclFuncRecv = 7,
|
||||
ncclFuncAllToAllPivot = 8,
|
||||
ncclNumFuncs = 9 // number of function identifiers above (0..8)
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
@@ -34,4 +44,6 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
|
||||
#endif
|
||||
#define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16
|
||||
|
||||
#endif
|
||||
@@ -21,6 +21,140 @@
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
typedef struct { // Per-device properties; filled through the getProperties() entry of the v8 net interface below
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // Supported pointer types: [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency (NOTE(review): unit is not stated here -- confirm)
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v8_t;
|
||||
|
||||
typedef ncclNetProperties_v8_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v8_t;
|
||||
|
||||
typedef ncclNet_v8_t ncclNet_t;
|
||||
|
||||
#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8
|
||||
|
||||
// One buffer segment handed to the v8 collective-network calls
// (presumably a scatter/gather element, going by the name - confirm).
typedef struct {
  void*    mhandle;  // memory handle associated with the segment
  void*    address;  // start address of the segment
  uint32_t size;     // segment size
} ncclNetSGE_v8_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
void* sendMhandle, void** request);
|
||||
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp,
|
||||
void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v8_t;
|
||||
|
||||
typedef ncclCollNet_v8_t ncclCollNet_t;
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
@@ -36,8 +170,6 @@ typedef struct {
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v7_t;
|
||||
|
||||
typedef ncclNetProperties_v7_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
@@ -93,11 +225,45 @@ typedef struct {
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v7_t;
|
||||
|
||||
typedef ncclNet_v7_t ncclNet_t;
|
||||
|
||||
#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v7_t;
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V6 8
|
||||
|
||||
@@ -162,49 +328,6 @@ typedef struct {
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v6_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v7_t;
|
||||
|
||||
typedef ncclCollNet_v7_t ncclCollNet_t;
|
||||
|
||||
// v6 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
|
||||
@@ -24,6 +24,7 @@ typedef struct {
|
||||
int needsProxyProgress;
|
||||
} ncclNetDeviceHandle_v7_t;
|
||||
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
|
||||
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -20,6 +20,12 @@
|
||||
// Dynamically handle dependencies on NVML
|
||||
|
||||
/* Extracted from nvml.h */
|
||||
|
||||
#define NVML_API_VERSION 12
|
||||
|
||||
/* Builds a structure version identifier: sizeof(nvml<data>_v<ver>_t) OR-ed
 * with <ver> shifted left by 24 bits. `data` and `ver` are pasted into the
 * type name, so both must be plain tokens rather than expressions. */
#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | (ver << 24U))
|
||||
|
||||
typedef struct nvmlDevice_st* nvmlDevice_t;
|
||||
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
|
||||
|
||||
@@ -181,6 +187,72 @@ typedef struct nvmlFieldValue_st
|
||||
nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
|
||||
} nvmlFieldValue_t;
|
||||
|
||||
|
||||
#define NVML_GPU_FABRIC_UUID_LEN 16
|
||||
|
||||
#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0
|
||||
#define NVML_GPU_FABRIC_STATE_NOT_STARTED 1
|
||||
#define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2
|
||||
#define NVML_GPU_FABRIC_STATE_COMPLETED 3
|
||||
|
||||
typedef unsigned char nvmlGpuFabricState_t;
|
||||
|
||||
typedef struct {
|
||||
unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
|
||||
nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete".
|
||||
unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs
|
||||
nvmlGpuFabricState_t state; //!< Current state of GPU registration process
|
||||
} nvmlGpuFabricInfo_t;
|
||||
|
||||
/* Possible values of the "degraded bandwidth" field of the fabric health mask. */
#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0
#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE          1
#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE         2

/* Position of the field inside the health mask, and the mask applied after
 * shifting. The field holds the values 0..2, so it needs the two low bits.
 * The previous value 0x11 selected bits 0 and 4, which made it impossible to
 * ever read back _FALSE (2) and let an unrelated bit leak into the result. */
#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0
#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x3

/**
 * GPU Fabric Health Status Mask for various fields can be obtained
 * using the below macro.
 * Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW)
 *
 * `type` is the field suffix (with its leading underscore); it is pasted onto
 * the _MASK_SHIFT / _MASK_WIDTH macro names.
 */
#define NVML_GPU_FABRIC_HEALTH_GET(var, type) \
    (((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \
     (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type))

/**
 * GPU Fabric Health Status Mask for various fields can be tested
 * using the below macro.
 * Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE)
 *
 * Evaluates to non-zero when the field selected by `type` equals the value
 * named by the `val` suffix.
 */
#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \
    (NVML_GPU_FABRIC_HEALTH_GET(var, type) == \
     NVML_GPU_FABRIC_HEALTH_MASK##type##val)
|
||||
|
||||
/**
|
||||
* GPU Fabric information (v2).
|
||||
*
|
||||
* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field
|
||||
* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask
|
||||
* field to the end. This structure is not backwards-compatible with
|
||||
* \ref nvmlGpuFabricInfo_t.
|
||||
*/
|
||||
typedef struct {
|
||||
unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuFabricInfo_v2)
|
||||
unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
|
||||
nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete".
|
||||
unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs
|
||||
nvmlGpuFabricState_t state; //!< Current state of GPU registration process
|
||||
unsigned int healthMask; //!< GPU Fabric health Status Mask
|
||||
} nvmlGpuFabricInfo_v2_t;
|
||||
|
||||
typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
|
||||
|
||||
/**
|
||||
* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version.
|
||||
*/
|
||||
#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2)
|
||||
|
||||
/* End of nvml.h */
|
||||
#endif // NCCL_NVML_DIRECT
|
||||
|
||||
@@ -210,5 +282,6 @@ ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
|
||||
ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
|
||||
ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
|
||||
ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
|
||||
ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo);
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
@@ -9,10 +9,22 @@
|
||||
#ifndef NCCL_P2P_H_
|
||||
#define NCCL_P2P_H_
|
||||
|
||||
#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
|
||||
#include <cuda.h>
|
||||
|
||||
typedef struct {
|
||||
#if CUDART_VERSION < 12030
// MNNVL: FABRIC handle definitions lifted from CUDA 12.3, supplied here when
// CUDART_VERSION is older than that release.
#define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128)
#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
#define CU_IPC_HANDLE_SIZE 64

// Fabric handle: a plain buffer of CU_IPC_HANDLE_SIZE bytes.
typedef struct CUmemFabricHandle_st {
  unsigned char data[CU_IPC_HANDLE_SIZE];
} CUmemFabricHandle_v1;

typedef CUmemFabricHandle_v1 CUmemFabricHandle;
#endif
|
||||
|
||||
typedef union {
|
||||
uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
|
||||
CUmemFabricHandle handle;
|
||||
} ncclCuDesc;
|
||||
|
||||
typedef union {
|
||||
|
||||
@@ -27,36 +27,43 @@ typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclP
|
||||
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
|
||||
|
||||
// Extra per-operation parameters, stored as a union with one member per kind
// of operation. Only the collnetDirect variant exists at the moment.
union ncclProxyOpSpecifics {
  struct {
    size_t sizePerRank;
    int nNodes;
    int node;
  } collnetDirect;
};
|
||||
|
||||
struct ncclProxyOp {
|
||||
struct ncclProxyConnection* connection;
|
||||
int channelId;
|
||||
int nsteps;
|
||||
void* buffer;
|
||||
ssize_t nbytes;
|
||||
struct {
|
||||
int root:30;
|
||||
uint32_t connIndex:2;
|
||||
};
|
||||
int next;
|
||||
|
||||
uint64_t opCount;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int root:30;
|
||||
uint32_t connIndex:2;
|
||||
int next;
|
||||
int nsteps;
|
||||
int chunkSize;
|
||||
uint8_t sliceSteps;
|
||||
uint8_t chunkSteps;
|
||||
uint8_t channelId;
|
||||
uint8_t /*ncclDataType_t*/ dtype;
|
||||
uint8_t /*ncclDevRedOp_t*/ redOp;
|
||||
uint8_t /*ncclFunc_t*/ coll;
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
uint8_t protocol;
|
||||
uint8_t reg;
|
||||
|
||||
union {
|
||||
uint64_t unused;
|
||||
// For use by enqueue.cc
|
||||
struct ncclProxyOp *enqNext;
|
||||
};
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
|
||||
struct ncclProxyOp *enqNext;
|
||||
};
|
||||
static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
|
||||
|
||||
struct ncclProxySubArgs {
|
||||
struct ncclProxyConnection* connection;
|
||||
int reg;
|
||||
void* buffer;
|
||||
void* mhandle;
|
||||
int channelId;
|
||||
int nsteps;
|
||||
ssize_t nbytes;
|
||||
@@ -93,6 +100,7 @@ struct ncclProxyArgs {
|
||||
uint8_t /*ncclDataType_t*/ dtype;
|
||||
uint8_t /*ncclDevRedOp_t*/ redOp;
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
uint8_t /*ncclFunc_t*/ coll;
|
||||
uint8_t protocol;
|
||||
int state;
|
||||
char* sharedBuff[NCCL_STEPS];
|
||||
@@ -105,6 +113,8 @@ struct ncclProxyArgs {
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
struct ncclProxyArgs** proxyAppendPtr;
|
||||
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
};
|
||||
#define NCCL_MAX_NETDEVS 128
|
||||
|
||||
@@ -112,7 +122,7 @@ struct ncclProxyArgs {
|
||||
// Make sure we have enough to store two full rounds of operations on all channels.
|
||||
// Otherwise we'd be unable to post half of them to free new elements.
|
||||
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
|
||||
#define NCCL_MAX_LOCAL_RANKS 64
|
||||
|
||||
struct ncclProxyOpsPool {
|
||||
struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
|
||||
volatile int nextOps;
|
||||
@@ -205,6 +215,16 @@ struct ncclProxyRpcResponseHeader {
|
||||
int respSize;
|
||||
};
|
||||
|
||||
// UDS support
|
||||
// Message header used by the UDS (Unix domain socket) support.
struct ncclIpcHdr {
  int      type;
  int      rank;
  int      reqSize;
  int      respSize;
  void*    opId;
  uint64_t data[16];  // 16 x 8 = 128 bytes of payload
};
|
||||
|
||||
struct ncclProxyState {
|
||||
int refCount;
|
||||
int tpRank;
|
||||
@@ -220,9 +240,11 @@ struct ncclProxyState {
|
||||
ncclNet_t* ncclNet;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
volatile uint32_t* abortFlag;
|
||||
// Service thread
|
||||
// Service threads
|
||||
pthread_t thread;
|
||||
pthread_t threadUDS;
|
||||
struct ncclSocket* listenSock;
|
||||
struct ncclIpcSocket ipcSock;
|
||||
int stop;
|
||||
CUcontext cudaCtx;
|
||||
ncclResult_t asyncResult;
|
||||
@@ -233,6 +255,7 @@ struct ncclProxyState {
|
||||
struct ncclProxyOps* proxyOps;
|
||||
void** sharedDevMems;
|
||||
struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
|
||||
uint64_t *peerAddressesUDS; // cuMem API support (UDS)
|
||||
|
||||
// Progress thread
|
||||
struct ncclProxyProgressState progressState;
|
||||
@@ -274,9 +297,9 @@ enum proxyMode {
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
|
||||
enum ncclProxyMsgType {
|
||||
@@ -300,11 +323,12 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
|
||||
ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
|
||||
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
|
||||
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd);
|
||||
// UDS support
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
|
||||
|
||||
ncclResult_t ncclProxyStop(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
|
||||
|
||||
ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
|
||||
#endif
|
||||
#endif
|
||||
@@ -0,0 +1,42 @@
|
||||
#ifndef NCCL_REGISTER_H_
|
||||
#define NCCL_REGISTER_H_
|
||||
|
||||
// Buffer registration state flags. Each flag occupies its own bit, so they
// can be combined in a single mask.
enum {
  NET_REG_COMPLETE    = 1 << 0,  // 0x01
  NVLS_REG_COMPLETE   = 1 << 1,  // 0x02
  NVLS_REG_POSSIBLE   = 1 << 2,  // 0x04
  NVLS_REG_NO_SUPPORT = 1 << 3   // 0x08
};
|
||||
|
||||
struct ncclReg {
|
||||
// common attributes
|
||||
size_t pages;
|
||||
int refs;
|
||||
uintptr_t addr;
|
||||
uint32_t state;
|
||||
// net reg
|
||||
int nDevs;
|
||||
int devs[MAXCHANNELS];
|
||||
void** handles;
|
||||
// nvls reg
|
||||
uintptr_t baseAddr;
|
||||
size_t baseSize;
|
||||
CUdeviceptr regAddr;
|
||||
size_t regSize;
|
||||
int dev;
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
|
||||
};
|
||||
|
||||
struct ncclRegCache {
|
||||
struct ncclReg **slots;
|
||||
int capacity, population;
|
||||
uintptr_t pageSize;
|
||||
void* sComms[MAXCHANNELS];
|
||||
void* rComms[MAXCHANNELS];
|
||||
};
|
||||
|
||||
ncclResult_t ncclRegCleanup(struct ncclComm* comm);
|
||||
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
|
||||
|
||||
#endif
|
||||
@@ -18,6 +18,7 @@ struct ncclShmemCollBuff {
|
||||
volatile size_t *cnt[2];
|
||||
volatile void *ptr[2];
|
||||
int round;
|
||||
size_t maxTypeSize;
|
||||
};
|
||||
|
||||
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
|
||||
|
||||
@@ -44,6 +44,8 @@ struct ncclPeerInfo {
|
||||
int64_t busId;
|
||||
struct ncclComm* comm;
|
||||
int cudaCompCap;
|
||||
// MNNVL support
|
||||
nvmlGpuFabricInfoV_t fabricInfo;
|
||||
};
|
||||
|
||||
#define CONNECT_SIZE 128
|
||||
|
||||
@@ -30,6 +30,11 @@ uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
ncclResult_t getRandomData(void* buffer, size_t bytes);
|
||||
|
||||
const char* ncclOpToString(ncclRedOp_t op);
|
||||
const char* ncclDatatypeToString(ncclDataType_t type);
|
||||
const char* ncclAlgoToString(int algo);
|
||||
const char* ncclProtoToString(int proto);
|
||||
|
||||
struct netIf {
|
||||
char prefix[64];
|
||||
int port;
|
||||
@@ -394,6 +399,36 @@ void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
|
||||
}
|
||||
}
|
||||
|
||||
/* cmp function determines the sequence of objects in the queue. If cmp returns value >= 0, it means a > b,
|
||||
* and we should put a before b; otherwise, b should be put ahead of a. */
|
||||
template<typename T, T *T::*next>
|
||||
inline void ncclIntruQueueSortEnqueue(ncclIntruQueue<T,next> *me, T *x, int (*cmp)(T *a, T *b)) {
|
||||
T *cur = me->head;
|
||||
T *prev = NULL;
|
||||
|
||||
if (cur == NULL) {
|
||||
x->*next = nullptr;
|
||||
me->tail = me->head = x;
|
||||
} else {
|
||||
while (cur) {
|
||||
if (cmp(cur, x) > 0) {
|
||||
prev = cur;
|
||||
cur = cur->next;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
x->*next = cur;
|
||||
if (prev) {
|
||||
prev->*next = x;
|
||||
if (cur == NULL) me->tail = x;
|
||||
} else {
|
||||
me->head = x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() {
|
||||
|
||||
+142
-120
@@ -329,6 +329,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
* resource cleanup in commFree(). */
|
||||
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
|
||||
pthread_join(comm->proxyState->thread, nullptr);
|
||||
if (comm->proxyState->threadUDS) {
|
||||
// UDS support
|
||||
pthread_join(comm->proxyState->threadUDS, nullptr);;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] comm->userRedOps;
|
||||
@@ -417,17 +421,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->topParentRanks);
|
||||
free(comm->topParentLocalRanks);
|
||||
|
||||
while (!ncclIntruQueueEmpty(&comm->regRecordQueue)) {
|
||||
struct ncclRegRecord* rec = ncclIntruQueueDequeue(&comm->regRecordQueue);
|
||||
NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
|
||||
free(rec->addrs);
|
||||
free(rec);
|
||||
}
|
||||
|
||||
while (!ncclIntruQueueEmpty(&comm->regRequestQueue)) {
|
||||
struct ncclRegRequest* req = ncclIntruQueueDequeue(&comm->regRequestQueue);
|
||||
free(req);
|
||||
}
|
||||
NCCLCHECK(ncclRegCleanup(comm));
|
||||
|
||||
commPoison(comm); // poison comm before free to avoid comm reuse.
|
||||
free(comm);
|
||||
@@ -472,7 +466,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
|
||||
/* comm must be ready, or error will be reported */
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
if (*comm->abortFlag) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
|
||||
ncclGroupJobAbort(comm->groupJob);
|
||||
} else {
|
||||
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
|
||||
@@ -555,7 +549,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm->channelSize = ncclParamAggChannelSize();
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||
@@ -588,9 +581,9 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
comm->topParentRanks[i] = i;
|
||||
}
|
||||
|
||||
ncclIntruQueueConstruct(&comm->regRequestQueue);
|
||||
ncclIntruQueueConstruct(&comm->regRecordQueue);
|
||||
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
|
||||
|
||||
comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -606,6 +599,8 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
comm->devComm = &devCommAndChans->comm;
|
||||
tmpCommAndChans.comm.rank = comm->rank;
|
||||
tmpCommAndChans.comm.nRanks = nRanks;
|
||||
tmpCommAndChans.comm.node = comm->node;
|
||||
tmpCommAndChans.comm.nNodes = comm->nNodes;
|
||||
tmpCommAndChans.comm.abortFlag = comm->abortFlag;
|
||||
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
|
||||
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
|
||||
@@ -638,6 +633,12 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
comm->workFifoSent = 0;
|
||||
comm->workFifoAckdMin = 0;
|
||||
|
||||
if (comm->collNetDenseToUserRank != nullptr) {
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
}
|
||||
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers;
|
||||
tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
|
||||
@@ -731,6 +732,26 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
|
||||
info->comm = comm;
|
||||
info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap;
|
||||
|
||||
#if !defined(__HIP_PLATFORM_HCC__) && !defined(__HCC__) && !defined(__HIPCC__)
|
||||
// MNNVL support
|
||||
{
|
||||
// MNNVL: Request the fabric UUID and partition info
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
nvmlDevice_t nvmlDev;
|
||||
NCCLCHECK(int64ToBusId(info->busId, busId));
|
||||
NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
|
||||
info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
|
||||
(void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo);
|
||||
if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) {
|
||||
INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
|
||||
info->busId,
|
||||
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
|
||||
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -774,8 +795,9 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
}
|
||||
|
||||
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
|
||||
else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
|
||||
// MNNVL support
|
||||
if (!comm->MNNVL && comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
|
||||
else if (comm->MNNVL || ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
|
||||
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
|
||||
|
||||
// Make sure P2P chunksize is not larger than coll chunksize.
|
||||
@@ -805,6 +827,8 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
|
||||
// Find all head ranks
|
||||
int nHeads = collNetGraph->nChannels;
|
||||
int nHeadsUnique = 0;
|
||||
int headsUnique[NCCL_MAX_LOCAL_RANKS];
|
||||
int highestTransportType0, highestTransportType1;
|
||||
char line[1024];
|
||||
bool share;
|
||||
@@ -816,13 +840,20 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
struct collnetShareInfo* infos = NULL;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
|
||||
// Head GPU index is always 0
|
||||
for (int c = 0; c < nHeads; c++) {
|
||||
heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
|
||||
{ uint64_t mask = 0;
|
||||
// Head GPU index is always 0
|
||||
for (int c = 0; c < nHeads; c++) {
|
||||
heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
|
||||
assert(comm->rankToNode[heads[c]] == comm->node);
|
||||
uint64_t mask0 = mask;
|
||||
mask |= 1ull<<comm->rankToLocalRank[heads[c]];
|
||||
if (mask != mask0) headsUnique[nHeadsUnique++] = heads[c];
|
||||
}
|
||||
}
|
||||
|
||||
comm->collNetHeads = heads;
|
||||
comm->collNetHeadsNum = nHeads;
|
||||
comm->collNetHeadsUniqueNum = nHeadsUnique;
|
||||
if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) {
|
||||
NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail);
|
||||
/* check whether child can share collnet resources of parent. Since parent builds each collnet communicator
|
||||
@@ -883,6 +914,26 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
|
||||
comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
|
||||
comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
|
||||
|
||||
comm->collNetDenseToUserRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
|
||||
comm->collNetUserToDenseRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
|
||||
{ // initialize collNetUserToDenseRank[rank]
|
||||
uint64_t nonHeadMask = (1ull<<comm->localRanks)-1;
|
||||
comm->collNetUserToDenseRank[rank] = -1;
|
||||
for (int h=0; h < nHeadsUnique; h++) {
|
||||
nonHeadMask ^= 1ull<<comm->rankToLocalRank[headsUnique[h]];
|
||||
if (headsUnique[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; }
|
||||
}
|
||||
if (comm->collNetUserToDenseRank[rank] == -1) {
|
||||
comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull<<comm->localRank)-1));
|
||||
}
|
||||
comm->collNetUserToDenseRank[rank] += comm->node*comm->localRanks;
|
||||
}
|
||||
NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int)));
|
||||
for (int r=0; r < comm->nRanks; r++) {
|
||||
comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r;
|
||||
}
|
||||
|
||||
for (int c = 0; c < comm->collNetChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels + c;
|
||||
NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail);
|
||||
@@ -1000,6 +1051,9 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
|
||||
NCCL_PARAM(MNNVL, "MNNVL", -2);
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {
|
||||
// We use 2 AllGathers
|
||||
// 1. { peerInfo, comm, compCap}
|
||||
@@ -1007,6 +1061,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
int nNodes = 1;
|
||||
cpu_set_t affinitySave;
|
||||
struct ncclTopoGraph ringGraph;
|
||||
struct ncclTopoGraph treeGraph;
|
||||
@@ -1054,6 +1109,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
|
||||
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
|
||||
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
|
||||
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
|
||||
ret = ncclInvalidUsage;
|
||||
@@ -1063,6 +1119,56 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
|
||||
// AllGather1 - end
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
|
||||
// MNNVL support
|
||||
if (nNodes > 1) {
|
||||
int cliqueSize = 0;
|
||||
comm->MNNVL = 0;
|
||||
// Determine the size of the MNNVL domain/clique
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[rank].fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
|
||||
// Check that the Fabric state is fully initialized
|
||||
if (fabricInfo2->state != NVML_GPU_FABRIC_STATE_COMPLETED) continue;
|
||||
// Check that the cluster UUID and cliqueId match in each rank
|
||||
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) continue;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
cliqueSize++;
|
||||
}
|
||||
}
|
||||
// Determine whether this is a MNNVL system
|
||||
comm->MNNVL = ncclParamMNNVL() < 0 ? cliqueSize == comm->nRanks : ncclParamMNNVL();
|
||||
// MNNVL requires cuMem to be enabled
|
||||
if (!ncclCuMemEnable()) comm->MNNVL = 0;
|
||||
if (comm->MNNVL) {
|
||||
// MNNVL also requires FABRIC handle support
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUdevice currentDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
|
||||
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
|
||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
|
||||
if (!flag)
|
||||
comm->MNNVL = 0;
|
||||
else
|
||||
// Force the handle type to be FABRIC for MNNVL
|
||||
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
if (ncclParamMNNVL() == 1 && !comm->MNNVL) {
|
||||
WARN("MNNVL is not supported on this system");
|
||||
ret = ncclSystemError;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
do {
|
||||
// Compute intra-process ranks
|
||||
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
|
||||
@@ -1347,6 +1453,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
goto fail;
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
|
||||
comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
|
||||
|
||||
nChannelsOrig = comm->nChannels;
|
||||
NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
|
||||
int nc;
|
||||
@@ -1439,7 +1548,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
comm->topParentLocalRanks = topParentLocalRanks;
|
||||
|
||||
// Launch proxy service thread, after this, the proxy calls can be used.
|
||||
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
|
||||
if (parent && parent->config.splitShare) {
|
||||
comm->proxyState = parent->sharedRes->proxyState;
|
||||
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
|
||||
}
|
||||
|
||||
// Connect with prev/next for each ring
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
@@ -1476,8 +1590,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Setup NVLS
|
||||
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
|
||||
// And NVLS trees if needed
|
||||
if (comm->nvlsSupport && comm->localRanks > 1) {
|
||||
for (int c=0; c<comm->nvlsChannels; c++) {
|
||||
if (comm->nvlsSupport && comm->nNodes > 1) {
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail);
|
||||
@@ -1496,7 +1610,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Compute time models for algorithm and protocol combinations
|
||||
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
|
||||
|
||||
INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->collNetChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
|
||||
do { // Setup p2p structures in comm->tasks
|
||||
struct ncclTasks* tasks = &comm->tasks;
|
||||
@@ -2281,7 +2395,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) {
|
||||
|
||||
NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail);
|
||||
TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state);
|
||||
if (state == ncclSuccess && *comm->abortFlag == 0 && comm->finalizeCalled == false) {
|
||||
if (state == ncclSuccess && __atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0 && comm->finalizeCalled == false) {
|
||||
/* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy
|
||||
* should be nonblocking until last call of ncclCommDestroy. */
|
||||
NCCLCHECKGOTO(commFinalize(comm, false), ret, fail);
|
||||
@@ -2406,9 +2520,9 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
// Ask anything that might still be running on the device to quit
|
||||
childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE);
|
||||
if (childAbortFlag != NULL) {
|
||||
*childAbortFlag = 1;
|
||||
__atomic_store_n(childAbortFlag, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
*comm->abortFlag = 1;
|
||||
__atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELAXED);
|
||||
/* init thread must be joined before we destroy the comm,
|
||||
* and we should ignore the init error here. */
|
||||
ncclCommEnsureReady(comm);
|
||||
@@ -2556,98 +2670,6 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
/*
 * Queue a user buffer for registration on `comm`.
 *
 * comm    communicator the buffer is registered with
 * buff    start of the user buffer
 * size    size of the user buffer in bytes
 * handle  out: opaque registration handle. Set to NULL whenever no request
 *         was queued (registration disabled, NVLS not supported, invalid
 *         arguments, or an error), so the caller never sees an
 *         indeterminate value.
 *
 * Returns ncclSuccess, or ncclInvalidArgument when the arguments are invalid
 * or neither the user range nor its enclosing allocation satisfies the
 * alignment/granularity checks below. Failures of the CUDA driver calls and
 * of the allocation are propagated through CUCHECK / NCCLCHECK.
 */
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t ret = ncclSuccess;

  // Several paths below succeed without queuing anything; give the caller a
  // well-defined handle in those cases instead of leaving it untouched.
  if (handle != NULL) *handle = NULL;

#if CUDART_VERSION >= 12010
  size_t granularity;
  if (ncclParamLocalRegister()) {
    if (comm == NCCL_COMM_NULL || buff == NULL || handle == NULL || size == 0) {
      WARN("Invalid arguments comm %p, buff %p, size %ld, handle %p", comm, buff, size, handle);
      ret = ncclInvalidArgument;
    } else if (comm->nvlsSupport) {
      CUmulticastObjectProp prop = comm->nvlsResources->properties;
      // Range that will be recorded in the request: the user range when it is
      // suitably aligned, otherwise the whole enclosing allocation.
      uintptr_t regAddr = (uintptr_t)buff;
      size_t regSize = size;

      prop.size = size;
      CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));

      bool usable = (regAddr % comm->nvlsResources->ucGran == 0) && (regSize % granularity == 0);
      if (!usable) {
        void* base;
        size_t baseSize;
        /* Since we don't provide actually allocated buffer size for users by ncclMemAlloc,
         * therefore, we need to get the full range of the buffer by cuMemGetAddressRange to
         * register buffers. */
        CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&base, &baseSize, (CUdeviceptr)buff));
        regAddr = (uintptr_t)base;
        regSize = baseSize;
        usable = (regAddr % comm->nvlsResources->ucGran == 0) && (regSize % granularity == 0);
      }

      if (usable) {
        struct ncclRegRequest* req;
        NCCLCHECK(ncclCalloc(&req, 1));
        req->buff = regAddr;
        req->size = regSize;
        ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
        *handle = (void*)req;
      } else {
        WARN("register fails, buffer %p (aligned %s, granularity %ld) and size %ld (aligned %s, granularity %ld) for registration", buff, (uintptr_t)buff % comm->nvlsResources->ucGran == 0 ? "TRUE" : "FALSE", comm->nvlsResources->ucGran, size, size % granularity == 0 ? "TRUE" : "FALSE", granularity);
        ret = ncclInvalidArgument;
      }
    }
  }
#endif

  return ret;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
/*
 * Undo a registration previously returned by ncclCommRegister.
 *
 * comm    communicator the buffer was registered with
 * handle  handle produced by ncclCommRegister (a struct ncclRegRequest*)
 *
 * Releases the matching registration record, if one exists, then unlinks and
 * frees the request itself. Returns ncclInvalidArgument when comm/handle are
 * null or when the handle is not found in the request queue; errors from
 * ncclNvlsDeregBuffer are propagated through NCCLCHECK.
 */
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
  ncclResult_t ret = ncclSuccess;

#if CUDART_VERSION >= 12010
  struct ncclRegRequest* dreq = (struct ncclRegRequest*)handle;
  if (ncclParamLocalRegister()) {
    if (comm == NCCL_COMM_NULL || handle == NULL) {
      WARN("Invalid arguments comm %p, handle %p", comm, handle);
      ret = ncclInvalidArgument;
    } else {
      /* first release register record */
      // NOTE(review): dreq is dereferenced here before the handle has been
      // validated against regRequestQueue below; a bogus handle is only
      // rejected afterwards.
      for (struct ncclRegRecord* rec = ncclIntruQueueHead(&comm->regRecordQueue); rec != NULL; rec = rec->next) {
        if (rec->buff == dreq->buff && rec->size == dreq->size) {
          NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
          ncclIntruQueueDelete(&comm->regRecordQueue, rec);
          free(rec->addrs);
          free(rec);
          break;  // rec is gone; leave before the loop step would read rec->next
        }
      }

      /* then free register request */
      if (ncclIntruQueueDelete(&comm->regRequestQueue, dreq) == false) {
        WARN("Invalid handle %p", handle);
        ret = ncclInvalidArgument;
      } else {
        // The request was allocated in ncclCommRegister and is no longer
        // reachable from the queue: release it, otherwise it leaks.
        free(dreq);
      }
    }
  }
#endif

  return ret;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
|
||||
ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
@@ -2759,4 +2781,4 @@ exit:
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -7,7 +7,7 @@
|
||||
#include "argcheck.h"
|
||||
#include "comm.h"
|
||||
|
||||
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
|
||||
ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
|
||||
cudaPointerAttributes attr;
|
||||
cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
|
||||
if (err != cudaSuccess || attr.devicePointer == NULL) {
|
||||
|
||||
@@ -14,6 +14,9 @@
|
||||
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
|
||||
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
|
||||
|
||||
// Handle type used for cuMemCreate()
|
||||
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
|
||||
static int ncclCuMemSupported = 0;
|
||||
|
||||
// Determine whether CUMEM & VMM RDMA is supported on this platform
|
||||
|
||||
@@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
WARN("UDS: Receiving data over socket failed : %d", errno);
|
||||
return ncclSystemError;
|
||||
}
|
||||
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
}
|
||||
|
||||
if (recvFd != NULL) {
|
||||
@@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
|
||||
return ncclSystemError;
|
||||
}
|
||||
if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
|
||||
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -39,6 +39,8 @@ namespace {
|
||||
NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
|
||||
NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
|
||||
NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))
|
||||
// MNNVL support
|
||||
NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo))
|
||||
|
||||
std::mutex lock; // NVML has had some thread safety bugs
|
||||
bool initialized = false;
|
||||
@@ -82,7 +84,9 @@ ncclResult_t ncclNvmlEnsureInitialized() {
|
||||
{(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
|
||||
{(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
|
||||
{(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
|
||||
{(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}
|
||||
{(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"},
|
||||
// MNNVL support
|
||||
{(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"},
|
||||
};
|
||||
for(Symbol sym: symbols) {
|
||||
*sym.ppfn = dlsym(libhandle, sym.name);
|
||||
@@ -269,3 +273,12 @@ ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount,
|
||||
NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// MNNVL support
|
||||
// Wrapper around the dynamically resolved nvmlDeviceGetGpuFabricInfoV symbol.
//   device         NVML device handle to query
//   gpuFabricInfo  in/out: its `version` field is set to nvmlGpuFabricInfo_v2
//                  here before the NVML call is made
// Returns ncclSuccess when the statements below run to completion.
// NCCLCHECK / NVMLTRY are defined outside this view; presumably they return
// early with an error code on failure -- confirm against their definitions.
ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo) {
|
||||
NCCLCHECK(ncclNvmlEnsureInitialized());
|
||||
// Hold the file-level NVML mutex for the rest of the function.
std::lock_guard<std::mutex> locked(lock);
|
||||
// Select the v2 layout of the fabric-info struct before handing it to NVML.
gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
|
||||
NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -169,7 +169,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
|
||||
int curRound = shmem->round;
|
||||
size_t mycnt;
|
||||
|
||||
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL) {
|
||||
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) {
|
||||
ret = ncclInvalidArgument;
|
||||
goto exit;
|
||||
}
|
||||
@@ -184,7 +184,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
|
||||
uint64_t t0 = clockNano();
|
||||
while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
|
||||
if (clockNano() - t0 >= 5 * 1000) sched_yield();
|
||||
if (*comm->abortFlag == 1) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) {
|
||||
ret = ncclInternalError;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
|
||||
}
|
||||
}
|
||||
(*offset) += bytes;
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) {
|
||||
if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) {
|
||||
INFO(NCCL_NET, "socketProgressOpt: abort called");
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -531,6 +531,8 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
|
||||
sock->state = ncclSocketStateConnecting;
|
||||
} else if (ret != EINPROGRESS) {
|
||||
sock->state = ncclSocketStateError;
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -618,12 +620,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
|
||||
do {
|
||||
NCCLCHECK(socketProgressState(sock));
|
||||
} while (sock->asyncFlag == 0 &&
|
||||
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) &&
|
||||
(sock->state == ncclSocketStateConnecting ||
|
||||
sock->state == ncclSocketStateConnectPolling ||
|
||||
sock->state == ncclSocketStateConnected));
|
||||
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
||||
if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
|
||||
switch (sock->state) {
|
||||
case ncclSocketStateConnecting:
|
||||
@@ -665,11 +667,11 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listen
|
||||
do {
|
||||
NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
|
||||
} while (sock->asyncFlag == 0 &&
|
||||
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) &&
|
||||
(sock->state == ncclSocketStateAccepting ||
|
||||
sock->state == ncclSocketStateAccepted));
|
||||
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
|
||||
if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
|
||||
|
||||
switch (sock->state) {
|
||||
case ncclSocketStateAccepting:
|
||||
|
||||
@@ -30,25 +30,25 @@ ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
|
||||
if (name) {
|
||||
INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
|
||||
tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
|
||||
}
|
||||
if (tunerPluginLib == nullptr) {
|
||||
// dlopen does not guarantee to set errno, but dlerror only gives us a
|
||||
// string, so checking errno doesn't hurt to try to provide a better
|
||||
// error message
|
||||
if (errno == ENOENT) {
|
||||
INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
|
||||
if (tunerPluginLib == nullptr) {
|
||||
// dlopen does not guarantee to set errno, but dlerror only gives us a
|
||||
// string, so checking errno doesn't hurt to try to provide a better
|
||||
// error message
|
||||
if (errno == ENOENT) {
|
||||
INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
|
||||
}
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
|
||||
}
|
||||
} else {
|
||||
tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
|
||||
if (tunerSymbol == nullptr) {
|
||||
INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
|
||||
dlclose(tunerPluginLib);
|
||||
tunerPluginLib = nullptr;
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
|
||||
tunerPluginRefCount = 0;
|
||||
tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
|
||||
if (tunerSymbol == nullptr) {
|
||||
INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
|
||||
dlclose(tunerPluginLib);
|
||||
tunerPluginLib = nullptr;
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
|
||||
tunerPluginRefCount = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -291,3 +291,79 @@ void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
|
||||
h = h1;
|
||||
}
|
||||
}
|
||||
|
||||
// Map a reduction operator to its printable name.
// Returns a pointer to a string literal; operators not listed below map to
// "Unknown".
const char* ncclOpToString(ncclRedOp_t op) {
  const char* label = "Unknown";
  switch (op) {
    case ncclSum:  label = "ncclSum";  break;
    case ncclProd: label = "ncclProd"; break;
    case ncclMax:  label = "ncclMax";  break;
    case ncclMin:  label = "ncclMin";  break;
    case ncclAvg:  label = "ncclAvg";  break;
    default:       break;
  }
  return label;
}
|
||||
|
||||
// Map a data type to its printable name.
// Returns a pointer to a string literal; types not listed below map to
// "Unknown". The bfloat16 entry exists only when
// __CUDA_BF16_TYPES_EXIST__ is defined.
const char* ncclDatatypeToString(ncclDataType_t type) {
  const char* label = "Unknown";
  switch (type) {
    case ncclInt8:    label = "ncclInt8";    break;  // ncclChar
    case ncclInt32:   label = "ncclInt32";   break;  // ncclInt
    case ncclUint32:  label = "ncclUint32";  break;
    case ncclInt64:   label = "ncclInt64";   break;
    case ncclUint64:  label = "ncclUint64";  break;
    case ncclFloat16: label = "ncclFloat16"; break;  // ncclHalf
    case ncclFloat32: label = "ncclFloat32"; break;  // ncclFloat
    case ncclFloat64: label = "ncclFloat64"; break;  // ncclDouble
#if defined(__CUDA_BF16_TYPES_EXIST__)
    case ncclBfloat16: label = "ncclBfloat16"; break;
#endif
    default: break;
  }
  return label;
}
|
||||
|
||||
const char* ncclAlgoToString(int algo) {
|
||||
switch (algo) {
|
||||
case NCCL_ALGO_TREE:
|
||||
return "TREE";
|
||||
case NCCL_ALGO_RING:
|
||||
return "RING";
|
||||
case NCCL_ALGO_COLLNET_DIRECT:
|
||||
return "COLLNET_DIRECT";
|
||||
case NCCL_ALGO_COLLNET_CHAIN:
|
||||
return "COLLNET_CHAIN";
|
||||
case NCCL_ALGO_NVLS:
|
||||
return "NVLS";
|
||||
case NCCL_ALGO_NVLS_TREE:
|
||||
return "NVLS_TREE";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
const char* ncclProtoToString(int proto) {
|
||||
switch (proto) {
|
||||
case NCCL_PROTO_LL:
|
||||
return "LL";
|
||||
case NCCL_PROTO_LL128:
|
||||
return "LL128";
|
||||
case NCCL_PROTO_SIMPLE:
|
||||
return "SIMPLE";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -264,11 +264,7 @@ const char* ncclGetErrorString(ncclResult_t result);
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
/*! @endcond */
|
||||
|
||||
/*! @brief Returns message on last result that occurred.
|
||||
@details Returns a human-readable message of the last error that occurred.
|
||||
@return String containing the last result
|
||||
|
||||
@param[in] comm is currently unused and can be set to NULL */
|
||||
/* Returns a human-readable message of the last error that occurred. */
|
||||
const char* ncclGetLastError(ncclComm_t comm);
|
||||
/*! @cond include_hidden */
|
||||
const char* pncclGetLastError(ncclComm_t comm);
|
||||
@@ -324,6 +320,18 @@ ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/*! @endcond */
|
||||
/*! @} */
|
||||
|
||||
/* Register CUDA buffer for zero-copy operation */
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
/*! @endcond */
|
||||
|
||||
/* Deregister CUDA buffer */
|
||||
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
/*! @endcond */
|
||||
|
||||
/*! @defgroup rccl_api_enumerations API Enumerations
|
||||
@details Enumerations used by collective communication calls
|
||||
@{ */
|
||||
@@ -812,16 +820,8 @@ ncclResult_t pncclGroupEnd();
|
||||
/*! @endcond */
|
||||
/*! @} */
|
||||
|
||||
/* Register CUDA buffer for zero-copy operation */
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
|
||||
/* Deregister CUDA buffer */
|
||||
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
#endif
|
||||
|
||||
#endif // end include guard
|
||||
#endif // end include guard
|
||||
+267
-123
@@ -16,16 +16,67 @@
|
||||
//#include <sys/stat.h>
|
||||
//#include <unistd.h>
|
||||
|
||||
static ncclNet_v7_t ncclNet_v5_as_v7;
|
||||
static ncclNet_v7_t ncclNet_v6_as_v7;
|
||||
static ncclNet_v8_t ncclNet_v5_as_v8;
|
||||
static ncclNet_v8_t ncclNet_v6_as_v8;
|
||||
static ncclNet_v8_t ncclNet_v7_as_v8;
|
||||
static ncclNet_v5_t *ncclNet_v5;
|
||||
static ncclNet_v6_t *ncclNet_v6;
|
||||
static ncclCollNet_v7_t ncclCollNet_v5_as_v7;
|
||||
static ncclCollNet_v7_t ncclCollNet_v6_as_v7;
|
||||
static ncclNet_v7_t *ncclNet_v7;
|
||||
static ncclCollNet_v8_t ncclCollNet_v5_as_v8;
|
||||
static ncclCollNet_v8_t ncclCollNet_v6_as_v8;
|
||||
static ncclCollNet_v8_t ncclCollNet_v7_as_v8;
|
||||
static ncclCollNet_v5_t *ncclCollNet_v5;
|
||||
static ncclCollNet_v6_t *ncclCollNet_v6;
|
||||
static ncclCollNet_v7_t *ncclCollNet_v7;
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
// Queries the wrapped v7 plugin for the properties of device `dev` and republishes
// them through the v8 properties struct.
//   dev   - device index, forwarded unchanged to the v7 plugin.
//   props - v8 struct to fill; written only when the v7 query succeeds.
// Returns whatever the v7 getProperties call returned.
static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v7_t legacy;
  const ncclResult_t rc = ncclNet_v7->getProperties(dev, &legacy);
  if (rc == ncclSuccess) {
    // Device identity.
    props->name = legacy.name;
    props->pciPath = legacy.pciPath;
    props->guid = legacy.guid;

    // Memory registration capabilities. regIsGlobal is not read from the v7
    // struct; it is fixed at 0 for wrapped plugins.
    props->ptrSupport = legacy.ptrSupport;
    props->regIsGlobal = 0;

    // Link characteristics and limits.
    props->speed = legacy.speed;
    props->port = legacy.port;
    props->maxComms = legacy.maxComms;
    props->maxRecvs = legacy.maxRecvs;
    props->latency = legacy.latency;

    // Device-offload description is passed through as reported by the plugin.
    props->netDeviceType = legacy.netDeviceType;
    props->netDeviceVersion = legacy.netDeviceVersion;
  }
  return rc;
}
|
||||
|
||||
// Adapts the v8 regMr entry point (size_t length) to the wrapped v7 plugin, whose
// regMr is called here with the length narrowed to int.
//   comm, data, type, mhandle - forwarded unchanged to the v7 plugin.
//   size                      - buffer length in bytes; must be below 2^31.
// Returns ncclInternalError when `size` cannot be narrowed to int without loss,
// otherwise the result of the v7 regMr call.
static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound is built in size_t on purpose. Written as `1<<31` it is computed as a
  // signed int, overflows, and after conversion for the comparison becomes a value
  // near 2^64 that no real size reaches, so the check would never fire and the
  // cast below would silently truncate large registrations.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// Initialises the wrapped v7 plugin and then populates the v8-shaped table that
// fronts it. Entries are copied after the plugin's own init() has been called;
// getProperties and regMr are routed through the local v7->v8 adapters, every other
// entry points straight at the v7 implementation.
//   logfn - logger callback handed to the plugin's init().
// Returns ncclSuccess once the table is filled (NCCLCHECK, defined elsewhere,
// handles a failing init()).
static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v7->init(logfn));

  ncclNet_v8_t* shim = &ncclNet_v7_as_v8;
  const ncclNet_v7_t* plugin = ncclNet_v7;

  // Identity and discovery.
  shim->name = plugin->name;
  shim->devices = plugin->devices;
  shim->getProperties = ncclNet_v7_as_v8_getProperties;

  // Connection establishment.
  shim->listen = plugin->listen;
  shim->connect = plugin->connect;
  shim->accept = plugin->accept;

  // Memory registration; regMr goes through the adapter because the v8 entry
  // point takes a size_t length.
  shim->regMr = ncclNet_v7_as_v8_regMr;
  shim->regMrDmaBuf = plugin->regMrDmaBuf;
  shim->deregMr = plugin->deregMr;

  // Data path.
  shim->isend = plugin->isend;
  shim->irecv = plugin->irecv;
  shim->iflush = plugin->iflush;
  shim->test = plugin->test;

  // Teardown.
  shim->closeSend = plugin->closeSend;
  shim->closeRecv = plugin->closeRecv;
  shim->closeListen = plugin->closeListen;

  // Device-side hooks.
  shim->getDeviceMr = plugin->getDeviceMr;
  shim->irecvConsumed = plugin->irecvConsumed;

  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -33,6 +84,7 @@ static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -43,38 +95,43 @@ static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
|
||||
// Adapts the v8 regMr entry point (size_t length) to the wrapped v6 plugin, whose
// regMr is called here with the length narrowed to int.
//   comm, data, type, mhandle - forwarded unchanged to the v6 plugin.
//   size                      - buffer length in bytes; must be below 2^31.
// Returns ncclInternalError when `size` cannot be narrowed to int without loss,
// otherwise the result of the v6 regMr call.
static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound is built in size_t on purpose. Written as `1<<31` it is computed as a
  // signed int, overflows, and after conversion for the comparison becomes a value
  // near 2^64 that no real size reaches, so the check would never fire and the
  // cast below would silently truncate large registrations.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// Adapter for connect(): forwards to the v6 plugin, which is called with three
// arguments only. The trailing device-handle parameter is accepted and ignored.
static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  ncclResult_t status = ncclNet_v6->connect(dev, handle, sendComm);
  return status;
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
|
||||
// Adapter for accept(): forwards to the v6 plugin, which is called with the listen
// and receive comm arguments only. The trailing device-handle parameter is accepted
// and ignored.
static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  ncclResult_t status = ncclNet_v6->accept(listenComm, recvComm);
  return status;
}
|
||||
|
||||
static ncclResult_t ncclNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v6->init(logfn));
|
||||
ncclNet_v6_as_v7.name = ncclNet_v6->name;
|
||||
ncclNet_v6_as_v7.devices = ncclNet_v6->devices;
|
||||
ncclNet_v6_as_v7.getProperties = ncclNet_v6_as_v7_getProperties; // ncclNet_v5->getProperties;
|
||||
ncclNet_v6_as_v7.listen = ncclNet_v6->listen;
|
||||
ncclNet_v6_as_v7.connect = ncclNet_v6_as_v7_connect;
|
||||
ncclNet_v6_as_v7.accept = ncclNet_v6_as_v7_accept;
|
||||
ncclNet_v6_as_v7.regMr = ncclNet_v6->regMr;
|
||||
ncclNet_v6_as_v7.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
|
||||
ncclNet_v6_as_v7.deregMr = ncclNet_v6->deregMr;
|
||||
ncclNet_v6_as_v7.isend = ncclNet_v6->isend;
|
||||
ncclNet_v6_as_v7.irecv = ncclNet_v6->irecv;
|
||||
ncclNet_v6_as_v7.iflush = ncclNet_v6->iflush;
|
||||
ncclNet_v6_as_v7.test = ncclNet_v6->test;
|
||||
ncclNet_v6_as_v7.closeSend = ncclNet_v6->closeSend;
|
||||
ncclNet_v6_as_v7.closeRecv = ncclNet_v6->closeRecv;
|
||||
ncclNet_v6_as_v7.closeListen = ncclNet_v6->closeListen;
|
||||
ncclNet_v6_as_v7.getDeviceMr = NULL;
|
||||
ncclNet_v6_as_v7.irecvConsumed = NULL;
|
||||
ncclNet_v6_as_v8.name = ncclNet_v6->name;
|
||||
ncclNet_v6_as_v8.devices = ncclNet_v6->devices;
|
||||
ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties;
|
||||
ncclNet_v6_as_v8.listen = ncclNet_v6->listen;
|
||||
ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect;
|
||||
ncclNet_v6_as_v8.accept = ncclNet_v6_as_v8_accept;
|
||||
ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr;
|
||||
ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
|
||||
ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr;
|
||||
ncclNet_v6_as_v8.isend = ncclNet_v6->isend;
|
||||
ncclNet_v6_as_v8.irecv = ncclNet_v6->irecv;
|
||||
ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush;
|
||||
ncclNet_v6_as_v8.test = ncclNet_v6->test;
|
||||
ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend;
|
||||
ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv;
|
||||
ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen;
|
||||
ncclNet_v6_as_v8.getDeviceMr = NULL;
|
||||
ncclNet_v6_as_v8.irecvConsumed = NULL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -82,6 +139,7 @@ static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -92,40 +150,45 @@ static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v5_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
|
||||
// Adapts the v8 regMr entry point (size_t length) to the wrapped v5 plugin, whose
// regMr is called here with the length narrowed to int.
//   comm, data, type, mhandle - forwarded unchanged to the v5 plugin.
//   size                      - buffer length in bytes; must be below 2^31.
// Returns ncclInternalError when `size` cannot be narrowed to int without loss,
// otherwise the result of the v5 regMr call.
static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound is built in size_t on purpose. Written as `1<<31` it is computed as a
  // signed int, overflows, and after conversion for the comparison becomes a value
  // near 2^64 that no real size reaches, so the check would never fire and the
  // cast below would silently truncate large registrations.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// Adapter for connect(): forwards to the v5 plugin, which is called with three
// arguments only. The trailing device-handle parameter is accepted and ignored.
static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  ncclResult_t status = ncclNet_v5->connect(dev, handle, sendComm);
  return status;
}
|
||||
|
||||
static ncclResult_t ncclNet_v5_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
|
||||
// Adapter for accept(): forwards to the v5 plugin, which is called with the listen
// and receive comm arguments only. The trailing device-handle parameter is accepted
// and ignored.
static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  ncclResult_t status = ncclNet_v5->accept(listenComm, recvComm);
  return status;
}
|
||||
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v5->init(logfn));
|
||||
ncclNet_v5_as_v7.name = ncclNet_v5->name;
|
||||
ncclNet_v5_as_v7.devices = ncclNet_v5->devices;
|
||||
ncclNet_v5_as_v7.getProperties = ncclNet_v5_as_v7_getProperties;
|
||||
ncclNet_v5_as_v7.listen = ncclNet_v5->listen;
|
||||
ncclNet_v5_as_v7.connect = ncclNet_v5_as_v7_connect;
|
||||
ncclNet_v5_as_v7.accept = ncclNet_v5_as_v7_accept;
|
||||
ncclNet_v5_as_v7.regMr = ncclNet_v5->regMr;
|
||||
ncclNet_v5_as_v7.regMrDmaBuf = NULL;
|
||||
ncclNet_v5_as_v7.deregMr = ncclNet_v5->deregMr;
|
||||
ncclNet_v5_as_v7.isend = ncclNet_v5->isend;
|
||||
ncclNet_v5_as_v7.irecv = ncclNet_v5->irecv;
|
||||
ncclNet_v5_as_v7.iflush = ncclNet_v5->iflush;
|
||||
ncclNet_v5_as_v7.test = ncclNet_v5->test;
|
||||
ncclNet_v5_as_v7.closeSend = ncclNet_v5->closeSend;
|
||||
ncclNet_v5_as_v7.closeRecv = ncclNet_v5->closeRecv;
|
||||
ncclNet_v5_as_v7.closeListen = ncclNet_v5->closeListen;
|
||||
ncclNet_v5_as_v7.getDeviceMr = NULL;
|
||||
ncclNet_v5_as_v7.irecvConsumed = NULL;
|
||||
ncclNet_v5_as_v8.name = ncclNet_v5->name;
|
||||
ncclNet_v5_as_v8.devices = ncclNet_v5->devices;
|
||||
ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties;
|
||||
ncclNet_v5_as_v8.listen = ncclNet_v5->listen;
|
||||
ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect;
|
||||
ncclNet_v5_as_v8.accept = ncclNet_v5_as_v8_accept;
|
||||
ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr;
|
||||
ncclNet_v5_as_v8.regMrDmaBuf = NULL;
|
||||
ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr;
|
||||
ncclNet_v5_as_v8.isend = ncclNet_v5->isend;
|
||||
ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv;
|
||||
ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush;
|
||||
ncclNet_v5_as_v8.test = ncclNet_v5->test;
|
||||
ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend;
|
||||
ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv;
|
||||
ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen;
|
||||
ncclNet_v5_as_v8.getDeviceMr = NULL;
|
||||
ncclNet_v5_as_v8.irecvConsumed = NULL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -133,6 +196,7 @@ static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -143,28 +207,35 @@ static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Adapts the v8 CollNet regMr entry point (size_t length) to the wrapped v5 CollNet
// plugin, whose regMr is called here with the length narrowed to int.
//   comm, data, type, mhandle - forwarded unchanged to the v5 plugin.
//   size                      - buffer length in bytes; must be below 2^31.
// Returns ncclInternalError when `size` cannot be narrowed to int without loss,
// otherwise the result of the v5 regMr call.
static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound is built in size_t on purpose. Written as `1<<31` it is computed as a
  // signed int, overflows, and after conversion for the comparison becomes a value
  // near 2^64 that no real size reaches, so the check would never fire and the
  // cast below would silently truncate large registrations.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v5->init(logfn));
|
||||
ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
|
||||
ncclCollNet_v5_as_v7.devices = ncclCollNet_v5->devices;
|
||||
ncclCollNet_v5_as_v7.getProperties = ncclCollNet_v5_as_v7_getProperties;
|
||||
ncclCollNet_v5_as_v7.listen = ncclCollNet_v5->listen;
|
||||
ncclCollNet_v5_as_v7.connect = ncclCollNet_v5->connect;
|
||||
ncclCollNet_v5_as_v7.reduceSupport = ncclCollNet_v5->reduceSupport;
|
||||
ncclCollNet_v5_as_v7.regMr = ncclCollNet_v5->regMr;
|
||||
ncclCollNet_v5_as_v7.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v5_as_v7.deregMr = ncclCollNet_v5->deregMr;
|
||||
ncclCollNet_v5_as_v7.iallreduce = ncclCollNet_v5->iallreduce;
|
||||
ncclCollNet_v5_as_v7.iflush = ncclCollNet_v5->iflush;
|
||||
ncclCollNet_v5_as_v7.test = ncclCollNet_v5->test;
|
||||
ncclCollNet_v5_as_v7.closeColl = ncclCollNet_v5->closeColl;
|
||||
ncclCollNet_v5_as_v7.closeListen = ncclCollNet_v5->closeListen;
|
||||
ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
|
||||
ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices;
|
||||
ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties;
|
||||
ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen;
|
||||
ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect;
|
||||
ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport;
|
||||
ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr;
|
||||
ncclCollNet_v5_as_v8.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr;
|
||||
ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce;
|
||||
ncclCollNet_v5_as_v8.iallgather = nullptr;
|
||||
ncclCollNet_v5_as_v8.ireducescatter = nullptr;
|
||||
ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush;
|
||||
ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test;
|
||||
ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl;
|
||||
ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
ncclNetProperties_v6_t p6;
|
||||
ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -172,6 +243,7 @@ static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
props->pciPath = p6.pciPath;
|
||||
props->guid = p6.guid;
|
||||
props->ptrSupport = p6.ptrSupport;
|
||||
props->regIsGlobal = 0;
|
||||
props->speed = p6.speed;
|
||||
props->port = p6.port;
|
||||
props->maxComms = p6.maxComms;
|
||||
@@ -182,24 +254,78 @@ static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetPropertie
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// Adapts the v8 CollNet regMr entry point (size_t length) to the wrapped v6 CollNet
// plugin, whose regMr is called here with the length narrowed to int.
//   comm, data, type, mhandle - forwarded unchanged to the v6 plugin.
//   size                      - buffer length in bytes; must be below 2^31.
// Returns ncclInternalError when `size` cannot be narrowed to int without loss,
// otherwise the result of the v6 regMr call.
static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound is built in size_t on purpose. Written as `1<<31` it is computed as a
  // signed int, overflows, and after conversion for the comparison becomes a value
  // near 2^64 that no real size reaches, so the check would never fire and the
  // cast below would silently truncate large registrations.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// We use a wrapper around the v6 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v6->init(logfn));
|
||||
ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
|
||||
ncclCollNet_v6_as_v7.devices = ncclCollNet_v6->devices;
|
||||
ncclCollNet_v6_as_v7.getProperties = ncclCollNet_v6_as_v7_getProperties;
|
||||
ncclCollNet_v6_as_v7.listen = ncclCollNet_v6->listen;
|
||||
ncclCollNet_v6_as_v7.connect = ncclCollNet_v6->connect;
|
||||
ncclCollNet_v6_as_v7.reduceSupport = ncclCollNet_v6->reduceSupport;
|
||||
ncclCollNet_v6_as_v7.regMr = ncclCollNet_v6->regMr;
|
||||
ncclCollNet_v6_as_v7.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
|
||||
ncclCollNet_v6_as_v7.deregMr = ncclCollNet_v6->deregMr;
|
||||
ncclCollNet_v6_as_v7.iallreduce = ncclCollNet_v6->iallreduce;
|
||||
ncclCollNet_v6_as_v7.iflush = ncclCollNet_v6->iflush;
|
||||
ncclCollNet_v6_as_v7.test = ncclCollNet_v6->test;
|
||||
ncclCollNet_v6_as_v7.closeColl = ncclCollNet_v6->closeColl;
|
||||
ncclCollNet_v6_as_v7.closeListen = ncclCollNet_v6->closeListen;
|
||||
ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
|
||||
ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices;
|
||||
ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties;
|
||||
ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen;
|
||||
ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect;
|
||||
ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport;
|
||||
ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr;
|
||||
ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
|
||||
ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr;
|
||||
ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce;
|
||||
ncclCollNet_v6_as_v8.iallgather = nullptr;
|
||||
ncclCollNet_v6_as_v8.ireducescatter = nullptr;
|
||||
ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush;
|
||||
ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test;
|
||||
ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl;
|
||||
ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Queries the wrapped v7 CollNet plugin for the properties of device `dev` and
// republishes them through the v8 properties struct.
//   dev   - device index, forwarded unchanged to the v7 plugin.
//   props - v8 struct to fill; written only when the v7 query succeeds.
// Returns whatever the v7 getProperties call returned.
static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v7_t legacy;
  const ncclResult_t rc = ncclCollNet_v7->getProperties(dev, &legacy);
  if (rc == ncclSuccess) {
    // Device identity.
    props->name = legacy.name;
    props->pciPath = legacy.pciPath;
    props->guid = legacy.guid;

    // Memory registration capabilities. regIsGlobal is not read from the v7
    // struct; it is fixed at 0 for wrapped plugins.
    props->ptrSupport = legacy.ptrSupport;
    props->regIsGlobal = 0;

    // Link characteristics and limits.
    props->speed = legacy.speed;
    props->port = legacy.port;
    props->maxComms = legacy.maxComms;
    props->maxRecvs = legacy.maxRecvs;
    props->latency = legacy.latency;

    // Unlike the point-to-point adapter, the device-offload fields are not copied
    // from the plugin: the type is forced to host and the version marked invalid.
    props->netDeviceType = NCCL_NET_DEVICE_HOST;
    props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
  }
  return rc;
}
|
||||
|
||||
// Adapts the v8 CollNet regMr entry point (size_t length) to the wrapped v7 CollNet
// plugin, whose regMr is called here with the length narrowed to int.
//   comm, data, type, mhandle - forwarded unchanged to the v7 plugin.
//   size                      - buffer length in bytes; must be below 2^31.
// Returns ncclInternalError when `size` cannot be narrowed to int without loss,
// otherwise the result of the v7 regMr call.
static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  // The bound is built in size_t on purpose. Written as `1<<31` it is computed as a
  // signed int, overflows, and after conversion for the comparison becomes a value
  // near 2^64 that no real size reaches, so the check would never fire and the
  // cast below would silently truncate large registrations.
  if (size >= ((size_t)1 << 31)) return ncclInternalError;
  return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle);
}
|
||||
|
||||
// We use a wrapper around the v7 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v7->init(logfn));
|
||||
ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
|
||||
ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices;
|
||||
ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties;
|
||||
ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen;
|
||||
ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect;
|
||||
ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport;
|
||||
ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr;
|
||||
ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
|
||||
ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr;
|
||||
ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce;
|
||||
ncclCollNet_v7_as_v8.iallgather = nullptr;
|
||||
ncclCollNet_v7_as_v8.ireducescatter = nullptr;
|
||||
ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush;
|
||||
ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test;
|
||||
ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl;
|
||||
ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -237,54 +363,72 @@ ncclResult_t ncclNetPluginInit() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclNets[0] = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
|
||||
ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8");
|
||||
if (ncclNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol.");
|
||||
// Try v6 plugin
|
||||
ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
|
||||
if (ncclNet_v6 == nullptr) {
|
||||
// Try v5 plugin
|
||||
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (ncclNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return ncclSuccess;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol.");
|
||||
// Try v7 plugin
|
||||
ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
|
||||
if (ncclNet_v7 == nullptr) {
|
||||
// Try v6 plugin
|
||||
ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
|
||||
if (ncclNet_v6 == nullptr) {
|
||||
// Try v5 plugin
|
||||
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (ncclNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v5_as_v8;
|
||||
ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v5_as_v8.name = ncclNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v5_as_v7;
|
||||
ncclNet_v5_as_v7.init = ncclNet_v5_as_v7_init;
|
||||
ncclNets[0] = &ncclNet_v6_as_v8;
|
||||
ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v5_as_v7.name = ncclNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
|
||||
ncclNet_v6_as_v8.name = ncclNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v6_as_v7;
|
||||
ncclNet_v6_as_v7.init = ncclNet_v6_as_v7_init;
|
||||
ncclNets[0] = &ncclNet_v7_as_v8;
|
||||
ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v6_as_v7.name = ncclNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
|
||||
ncclNet_v7_as_v8.name = ncclNet_v7->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name);
|
||||
}
|
||||
}
|
||||
|
||||
// Check for CollNet
|
||||
ncclCollNets[0] = (ncclCollNet_v7_t*) dlsym(netPluginLib, "ncclCollNetPlugin_v7");
|
||||
ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8");
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
|
||||
ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
|
||||
if (ncclCollNet_v6 == nullptr) {
|
||||
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (ncclCollNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol.");
|
||||
ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7");
|
||||
if (ncclCollNet_v7 == nullptr) {
|
||||
ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
|
||||
if (ncclCollNet_v6 == nullptr) {
|
||||
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (ncclCollNet_v5 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v5_as_v8;
|
||||
ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init;
|
||||
ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v5_as_v7;
|
||||
ncclCollNet_v5_as_v7.init = ncclCollNet_v5_as_v7_init;
|
||||
ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
|
||||
ncclCollNets[0] = &ncclCollNet_v6_as_v8;
|
||||
ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init;
|
||||
ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v6_as_v7;
|
||||
ncclCollNet_v6_as_v7.init = ncclCollNet_v6_as_v7_init;
|
||||
ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
|
||||
ncclCollNets[0] = &ncclCollNet_v7_as_v8;
|
||||
ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init;
|
||||
ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v7)", ncclCollNets[0]->name);
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -330,6 +474,7 @@ static ncclResult_t netGetState(int i, enum ncclNetState* state) {
|
||||
}
|
||||
|
||||
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
|
||||
pthread_mutex_lock(&netLock);
|
||||
if (ncclCollNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
@@ -337,6 +482,7 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
|
||||
else ncclCollNetStates[i] = ncclNetStateEnabled;
|
||||
}
|
||||
*state = ncclCollNetStates[i];
|
||||
pthread_mutex_unlock(&netLock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -421,7 +567,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
|
||||
while (!connected) {
|
||||
|
||||
// If we're aborting now, skip to cleanup
|
||||
if (*comm->abortFlag) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
|
||||
goto cleanup2;
|
||||
}
|
||||
|
||||
@@ -458,11 +604,9 @@ cleanup1:
|
||||
}
|
||||
|
||||
int ncclNetVersion(struct ncclComm* comm) {
|
||||
if (comm->ncclNet == &ncclNet_v5_as_v7) {
|
||||
return 5;
|
||||
} else if (comm->ncclNet == &ncclNet_v6_as_v7) {
|
||||
return 6;
|
||||
} else {
|
||||
return 7;
|
||||
}
|
||||
return
|
||||
(comm->ncclNet == &ncclNet_v5_as_v8) ? 5 :
|
||||
(comm->ncclNet == &ncclNet_v6_as_v8) ? 6 :
|
||||
(comm->ncclNet == &ncclNet_v7_as_v8) ? 7 :
|
||||
8;
|
||||
}
|
||||
|
||||
@@ -353,20 +353,22 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
WARN("Proxy append out of bounds");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
//memset(sub, 0, sizeof(struct ncclProxySubArgs));
|
||||
sub->connection = op->connection;
|
||||
sub->channelId = op->channelId;
|
||||
sub->nsteps = op->nsteps;
|
||||
sub->nbytes = op->nbytes;
|
||||
sub->peer = op->root;
|
||||
sub->reg = op->reg;
|
||||
sub->buffer = op->buffer;
|
||||
args->nsubs = subIndex+1;
|
||||
if (subIndex) {
|
||||
if ((args->sliceSteps != op->sliceSteps) ||
|
||||
(args->chunkSteps != op->chunkSteps) ||
|
||||
(args->protocol != op->protocol) ||
|
||||
(args->dtype != op->dtype) ||
|
||||
(args->redOp != op->redOp)) {
|
||||
(args->redOp != op->redOp) ||
|
||||
(args->coll != op->coll)) {
|
||||
WARN("Proxy append mismatch");
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -386,6 +388,8 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
args->redOp = op->redOp;
|
||||
args->pattern = op->pattern;
|
||||
args->protocol = op->protocol;
|
||||
args->coll = op->coll;
|
||||
args->specifics = op->specifics;
|
||||
args->state = ncclProxyOpReady;
|
||||
args->progress = op->connection->tcomm->proxyProgress;
|
||||
args->proxyAppendPtr = op->connection->proxyAppendPtr;
|
||||
@@ -595,7 +599,7 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
|
||||
|
||||
NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
|
||||
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) {
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op, int reg) {
|
||||
memset(op, 0, sizeof(struct ncclProxyOp));
|
||||
int channelId = info->channelId;
|
||||
struct ncclChannel* channel = info->comm->channels+channelId;
|
||||
@@ -616,15 +620,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
op->pattern = ncclPatternSend;
|
||||
if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && peer->send[1].proxyConn.sameProcess) op->reg = reg;
|
||||
}
|
||||
} else if (info->coll == ncclFuncRecv) {
|
||||
op->pattern = ncclPatternRecv;
|
||||
if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE && peer->recv[1].proxyConn.sameProcess) op->reg = reg;
|
||||
}
|
||||
} else {
|
||||
WARN("P2p operation is neither send or recv");
|
||||
@@ -633,17 +639,21 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
if (ncclParamChunkSize() != 0) {
|
||||
info->chunkSize = ncclParamChunkSize();
|
||||
}
|
||||
op->buffer = op->reg ? info->recvbuff : NULL;
|
||||
op->chunkSize = info->chunkSize;
|
||||
op->nbytes = info->count;
|
||||
|
||||
// Compute nSteps for proxies
|
||||
int chunkEffectiveSize = op->chunkSize;
|
||||
if (op->protocol == NCCL_PROTO_LL) {
|
||||
chunkEffectiveSize /= 2;
|
||||
op->nbytes *= 2;
|
||||
op->nbytes = DIVUP(op->nbytes, sizeof(union ncclLLFifoLine)) * sizeof(union ncclLLFifoLine);
|
||||
}
|
||||
|
||||
op->nbytes = stepSize;
|
||||
if (!op->reg) op->nbytes = std::min(op->nbytes, (ssize_t)info->chunkSize);
|
||||
op->nsteps = DIVUP(info->count, chunkEffectiveSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
if (op->nsteps == 0 || op->reg) op->nsteps = 1;
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1074,35 +1084,60 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
// The response is sent out-of-band using ncclIpcSocket for this specific command
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
ncclResult_t res = ncclInProgress;
|
||||
// UDS support
|
||||
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
|
||||
ncclResult_t res = ncclSuccess;
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
void *opId = (void*)((((uintptr_t)random()) << 32) | random());
|
||||
|
||||
// Create a UDS socket to receive the converted fd
|
||||
NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank];
|
||||
|
||||
// Request the allocation of a UDS fd for the handle over sockets
|
||||
NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), 0, opId), ret, error);
|
||||
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p",
|
||||
comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId);
|
||||
|
||||
// Receive the converted fd over UDS
|
||||
NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, error);
|
||||
TRACE(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx rank %d returned fd %d", *(uint64_t*)handle, proxyConn->tpLocalRank, *convertedFd);
|
||||
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error);
|
||||
// cuMem: Create a UDS socket to receive the response
|
||||
NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag));
|
||||
|
||||
// Wait for proxy response (sockets)
|
||||
while (res == ncclInProgress) {
|
||||
res = ncclPollProxyResponse(comm, proxyConn, NULL, opId);
|
||||
}
|
||||
ncclIpcHdr hdr;
|
||||
hdr.type = type;
|
||||
hdr.rank = rank;
|
||||
hdr.reqSize = reqSize;
|
||||
hdr.respSize = respSize;
|
||||
hdr.opId = opId;
|
||||
assert(reqSize <= sizeof(hdr.data));
|
||||
memcpy(&hdr.data, reqBuff, reqSize);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error);
|
||||
|
||||
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE",
|
||||
comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
|
||||
|
||||
return res;
|
||||
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
// The request/response is sent out-of-band using ncclIpcSocket for this specific command
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
// Request the allocation of a UDS fd for the handle
|
||||
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error);
|
||||
|
||||
// We have now received the converted fd over UDS
|
||||
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd);
|
||||
|
||||
return ret;
|
||||
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
WARN("ncclProxyClientGetFd call to rank %d handle 0x%lx failed : %d", proxyConn->tpRank, *(uint64_t*)handle, ret);
|
||||
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1137,7 +1172,7 @@ error:
|
||||
ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
// Receive the connection pointer from the Proxy
|
||||
if (*comm->abortFlag) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
|
||||
WARN("Comm %p is in abort state", comm);
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -1292,13 +1327,13 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, uint64_t handle) {
|
||||
static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
// cuMem API support
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
uint64_t hash = (uint64_t) opId;
|
||||
INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, peer->tpLocalRank, hash);
|
||||
INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, rank, hash);
|
||||
|
||||
CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
int fd = -1;
|
||||
@@ -1306,7 +1341,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, stru
|
||||
CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0));
|
||||
// Send back the converted fd using UDS
|
||||
NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash), ret, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, rank, hash), ret, error);
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
// We can now safely close the exported fd
|
||||
@@ -1331,11 +1366,8 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
|
||||
if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels);
|
||||
__atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
|
||||
} else if (op->type == ncclProxyMsgGetFd) {
|
||||
uint64_t handle = *(uint64_t*)op->reqBuff;
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgGetFd opId=%p op.reqBuff=%p handle=0x%lx", op->opId, op->reqBuff, handle);
|
||||
res = proxyGetFd(peer, op->opId, proxyState, handle); // cuMem API support
|
||||
} else if (op->type == ncclProxyMsgInit) {
|
||||
}
|
||||
else if (op->type == ncclProxyMsgInit) {
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
|
||||
res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
|
||||
} else return ncclInternalError;
|
||||
@@ -1365,7 +1397,7 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
|
||||
(*asyncOpCount)--;
|
||||
return ncclSuccess;
|
||||
|
||||
} else if (*proxyState->abortFlag != 0) {
|
||||
} else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) {
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -1451,7 +1483,7 @@ void* ncclProxyService(void* _args) {
|
||||
/* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
|
||||
* connections. Need to wait until all other related comms call abort and safely exit
|
||||
* together, or we could face segmentation fault. */
|
||||
if (*proxyState->abortFlag != 0) stop = 1;
|
||||
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) stop = 1;
|
||||
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
||||
int ret;
|
||||
do {
|
||||
@@ -1568,13 +1600,74 @@ void* ncclProxyService(void* _args) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
|
||||
|
||||
// Process a request on the UDS socket
|
||||
static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) {
|
||||
ncclIpcHdr hdr;
|
||||
NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), NULL));
|
||||
if (hdr.type == ncclProxyMsgGetFd) {
|
||||
// cuMem API support
|
||||
uint64_t handle = *(uint64_t*)hdr.data;
|
||||
INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
|
||||
return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
|
||||
}
|
||||
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// UDS fd handle support
|
||||
void* ncclProxyServiceUDS(void* _args) {
|
||||
struct ncclProxyState* proxyState = (struct ncclProxyState*) _args;
|
||||
struct pollfd pollfds[1];
|
||||
|
||||
if (setProxyThreadContext(proxyState)) {
|
||||
INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev);
|
||||
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
|
||||
WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev);
|
||||
}
|
||||
|
||||
if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) {
|
||||
WARN("[Proxy Service UDS] Get listenSock fd fails");
|
||||
return NULL;
|
||||
};
|
||||
pollfds[0].events = POLLIN|POLLHUP;
|
||||
|
||||
while (1) {
|
||||
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
||||
int ret;
|
||||
do {
|
||||
ret = poll(pollfds, 1, 500);
|
||||
} while (ret < 0 && errno == EINTR);
|
||||
if (ret < 0) {
|
||||
WARN("[Proxy Service UDS] Poll failed: %s", strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Check for stop/abort
|
||||
if (proxyState->stop || *proxyState->abortFlag) break;
|
||||
|
||||
if (pollfds[0].revents) {
|
||||
// A request was seen on the UDS fd
|
||||
proxyUDSRecvReq(proxyState, pollfds[0].fd);
|
||||
}
|
||||
}
|
||||
|
||||
ncclIpcSocketClose(&proxyState->ipcSock);
|
||||
INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, *proxyState->abortFlag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS) {
|
||||
assert(comm->sharedRes->proxyState == NULL);
|
||||
NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1));
|
||||
comm->proxyState = comm->sharedRes->proxyState;
|
||||
comm->proxyState->refCount = 1;
|
||||
comm->proxyState->listenSock = sock;
|
||||
comm->proxyState->peerAddresses = peerAddresses;
|
||||
comm->proxyState->peerAddressesUDS = peerAddressesUDS;
|
||||
|
||||
// UDS support
|
||||
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag));
|
||||
// Seed the random number generator for UDS filename generation
|
||||
struct timeval time;
|
||||
gettimeofday(&time,NULL);
|
||||
@@ -1606,6 +1699,11 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
|
||||
|
||||
pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
|
||||
ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
|
||||
|
||||
// UDS support
|
||||
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
|
||||
pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
|
||||
ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1615,8 +1713,13 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
|
||||
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
|
||||
if (comm->proxyState->threadUDS) {
|
||||
// UDS support
|
||||
comm->proxyState->stop = 1;
|
||||
}
|
||||
|
||||
if (sharedProxyState->peerAddresses) {
|
||||
if (*comm->abortFlag == 0) {
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) {
|
||||
struct ncclSocket sock;
|
||||
int type = ncclProxyMsgStop;
|
||||
NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
|
||||
@@ -1641,7 +1744,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
int type = ncclProxyMsgClose;
|
||||
if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
|
||||
NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
|
||||
}
|
||||
}
|
||||
@@ -1657,6 +1760,7 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
|
||||
|
||||
assert(sharedProxyState->refCount == 0);
|
||||
free(sharedProxyState->peerAddresses);
|
||||
free(sharedProxyState->peerAddressesUDS);
|
||||
free(sharedProxyState->peerSocks);
|
||||
free(sharedProxyState->proxyOps);
|
||||
free(sharedProxyState->sharedDevMems);
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "argcheck.h" // Need some checks here since we access comm
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
#include "register.h"
|
||||
|
||||
// Release every network memory registration held by `reg`.
//
// For each device recorded in reg->devs[0..nDevs) whose handle is non-NULL,
// calls the net plugin's deregMr() on the cached send comm for that device.
// On success the handle array is freed and reg->nDevs is reset to 0.
//
// ncclDebugNoWarn is set to NCCL_NET for the duration of the call and is
// always restored to 0 before returning, including when deregMr() fails.
//
// Returns ncclSuccess, or the first deregMr() error; in the error case
// reg->nDevs and reg->handles are left untouched.
ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
  struct ncclRegCache* cache = &comm->regCache;
  ncclResult_t ret = ncclSuccess;
  ncclDebugNoWarn = NCCL_NET;
  // reg->handles can still be NULL while nDevs > 0 when registration bailed
  // out before the handle array was allocated; there is nothing to release then.
  if (reg->handles != NULL) {
    for (int d=0; d<reg->nDevs; d++) {
      if (reg->handles[d] != NULL) NCCLCHECKGOTO(comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d]), ret, end);
    }
  }
  reg->nDevs = 0;
  free(reg->handles);
  reg->handles = NULL;
end:
  ncclDebugNoWarn = 0;
  return ret;
}
|
||||
|
||||
// Register the buffer [addr, addr+size) with the network devices used by this
// rank's p2p channels and store the resulting handles in `reg`.
//
// Steps:
//  1. Return immediately if the topology reports no network device.
//  2. Collect the distinct local net device of each p2p channel into
//     reg->devs. If any device reports regIsGlobal == 0, no device is kept.
//  3. Allocate reg->handles and, for each device, make sure a loopback
//     send/recv comm pair exists in the registration cache (creating it with
//     listen/connect/accept if needed), then call regMr() on the send comm.
//
// If regMr() fails for a device, all registrations made so far are released
// and the function still returns ncclSuccess with reg->nDevs == 0.
// If the communicator is aborted while connecting, the function stops and
// returns with whatever `ret` holds at that point.
//
// ncclDebugNoWarn is set to NCCL_NET while talking to the plugin and is reset
// to 0 on every exit path after that point.
ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
  struct ncclRegCache* cache = &comm->regCache;
  int netCount;
  NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
  if (netCount == 0) return ncclSuccess;

  ncclResult_t ret = ncclSuccess;
  void* lComm = NULL; // Listen comm currently open, closed at `end` if still set

  // Find local devices for p2p operations
  for (int c=0; c<comm->p2pnChannels; c++) {
    int dev;
    if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, &dev) != ncclSuccess) goto end; // No local net
    ncclNetProperties_t props;
    NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
    if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
      reg->nDevs = 0;
      break;
    }
    int found = 0;
    for (int d=0; d<reg->nDevs; d++) if (reg->devs[d] == dev) found = 1;
    if (!found) reg->devs[reg->nDevs++] = dev;
  }

  NCCLCHECKGOTO(ncclCalloc(&reg->handles, reg->nDevs), ret, end);

  ncclDebugNoWarn = NCCL_NET;
  for (int d=0; d<reg->nDevs; d++) {
    int dev = reg->devs[d];
    reg->handles[d] = NULL;

    if (cache->sComms[dev] == NULL) {
      // Create a loopback network comm object for that device to register the buffers.
      ncclNetHandle_t netHandle;
      bool connected = false;
      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end);
      while (!connected) {
        if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
          goto end;
        }
        if (cache->sComms[dev] == NULL)
          NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, end);
        if (cache->rComms[dev] == NULL)
          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, end);
        connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL);
      }
      // Clear lComm before closing so a failing closeListen is not retried at `end`.
      void* listenComm = lComm;
      lComm = NULL;
      NCCLCHECKGOTO(comm->ncclNet->closeListen(listenComm), ret, end);
    }
    if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) {
      // Registration is optional: undo what was registered and report success.
      reg->handles[d] = NULL;
      NCCLCHECKGOTO(ncclNetDeregister(comm, reg), ret, fail);
      reg->nDevs = 0;
      goto end;
    }
  }
end:
  // Reached with a listen comm still open on abort or on connect/accept errors.
  if (lComm != NULL) {
    ncclResult_t closeRet = comm->ncclNet->closeListen(lComm);
    if (ret == ncclSuccess) ret = closeRet;
  }
  ncclDebugNoWarn = 0;
  // Without a handle array nothing was registered; do not advertise devices.
  if (reg->handles == NULL) reg->nDevs = 0;
  if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg));
  return ret;
fail:
  // Deregistration itself failed: do not attempt it a second time.
  ncclDebugNoWarn = 0;
  return ret;
}
|
||||
|
||||
// Look up a cached registration that fully covers [data, data+size).
//
// The range is widened to whole pages (cache->pageSize) before the search.
// Slots are scanned in order; the scan stops at the first slot whose base
// address lies above the requested one. *reg receives the covering entry, or
// NULL when none is found. Always returns ncclSuccess.
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
  struct ncclRegCache* regCache = &comm->regCache;
  const uintptr_t pageBytes = regCache->pageSize;
  const uintptr_t base = (uintptr_t)data & -pageBytes;
  const size_t nPages = ((uintptr_t)data + size - base + pageBytes-1)/pageBytes;

  *reg = NULL;
  int idx = 0;
  while (idx != regCache->population) {
    struct ncclReg* entry = regCache->slots[idx];
    // Entries at or past this index start above the request: no match possible.
    if (base < entry->addr) break;
    // The entry starts at or below the request; check that it extends far enough.
    if ((base-entry->addr)/pageBytes+nPages <= entry->pages) {
      *reg = entry;
      break;
    }
    idx++;
  }
  return ncclSuccess;
}
|
||||
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
||||
|
||||
ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
|
||||
if (!ncclParamLocalRegister()) return ncclSuccess;
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
uintptr_t pageSize = cache->pageSize;
|
||||
uintptr_t addr = (uintptr_t)data & -pageSize;
|
||||
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
||||
for (int slot=0; /*true*/; slot++) {
|
||||
if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
|
||||
if (cache->population == cache->capacity) { // must grow cache
|
||||
cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
|
||||
NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity));
|
||||
}
|
||||
memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
|
||||
NCCLCHECK(ncclCalloc(cache->slots+slot, 1));
|
||||
struct ncclReg* regSlot = cache->slots[slot];
|
||||
regSlot->addr = addr;
|
||||
regSlot->pages = pages;
|
||||
regSlot->refs = 1;
|
||||
NCCLCHECK(ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot));
|
||||
regSlot->state |= NET_REG_COMPLETE;
|
||||
cache->population += 1;
|
||||
*handle = regSlot;
|
||||
return ncclSuccess;
|
||||
} else if ((addr >= cache->slots[slot]->addr) &&
|
||||
((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
|
||||
cache->slots[slot]->refs++;
|
||||
*handle = cache->slots[slot];
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tear down the communicator's registration cache.
//
// Every cached entry is deregistered from the network, has its NVLS buffer
// deregistered when its state carries NVLS_REG_COMPLETE, and is freed. The
// slot array is then freed, and every cached send/recv comm (indices
// 0..MAXCHANNELS-1) is closed through the net plugin.
// Returns the first error reported by any of these steps, else ncclSuccess.
ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
  struct ncclRegCache* regCache = &comm->regCache;
  int idx = 0;
  while (idx < regCache->population) {
    struct ncclReg* entry = regCache->slots[idx];
    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)entry->addr, entry->pages);
    NCCLCHECK(ncclNetDeregister(comm, entry));
    if (entry->state & NVLS_REG_COMPLETE) {
      NCCLCHECK(ncclNvlsDeregBuffer(&entry->mcHandle, entry->regAddr, entry->dev, entry->regSize));
    }
    free(entry);
    idx++;
  }
  free(regCache->slots);

  // Close the comms created for registration purposes.
  for (int ch=0; ch<MAXCHANNELS; ch++) {
    if (regCache->sComms[ch]) {
      NCCLCHECK(comm->ncclNet->closeSend(regCache->sComms[ch]));
    }
    if (regCache->rComms[ch]) {
      NCCLCHECK(comm->ncclNet->closeRecv(regCache->rComms[ch]));
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
|
||||
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
|
||||
NCCLCHECK(ncclRegister(comm, buff, size, handle));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
|
||||
struct ncclReg* reg = (struct ncclReg*)handle;
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
int slot;
|
||||
for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
|
||||
if (slot == cache->population) {
|
||||
WARN("Deregister: Could not find handle");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
if (--reg->refs) return ncclSuccess;
|
||||
NCCLCHECK(ncclNetDeregister(comm, reg));
|
||||
if (reg->state & NVLS_REG_COMPLETE) {
|
||||
NCCLCHECK(ncclNvlsDeregBuffer(®->mcHandle, reg->regAddr, reg->dev, reg->regSize));
|
||||
reg->regAddr = (CUdeviceptr)NULL;
|
||||
}
|
||||
free(reg);
|
||||
memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
|
||||
cache->population -= 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -341,10 +341,10 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
for (int r = 0; r < nranks; r++) {
|
||||
if (allConnects[r].isMaster) {
|
||||
memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
|
||||
if (r == rank) rankInCollNet = c;
|
||||
c++;
|
||||
}
|
||||
}
|
||||
if (isMaster) rankInCollNet = comm->node;
|
||||
} else { // send side : copy in connect info received from peer recv master
|
||||
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
|
||||
}
|
||||
|
||||
@@ -87,8 +87,8 @@ struct connectMap {
|
||||
};
|
||||
|
||||
struct reqSlot {
|
||||
volatile void* recvBuff;
|
||||
volatile int size;
|
||||
bool turnIsSendNotRecv;
|
||||
int size;
|
||||
};
|
||||
|
||||
struct sendResources {
|
||||
@@ -246,9 +246,11 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
send->conn.tail = &recvMem->tail;
|
||||
send->conn.sizesFifo = recvMem->sizesFifo;
|
||||
for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
|
||||
send->conn.offsFifo = recvMem->offsFifo;
|
||||
send->conn.connFifo = recvMem->connFifo;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
send->conn.connFifo[i].size = -1;
|
||||
send->conn.connFifo[i].mode = NCCL_MODE_OFFSET;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -277,7 +279,10 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
|
||||
recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
|
||||
recv->conn.offsFifo = recvMem->offsFifo;
|
||||
recv->conn.connFifo = recvMem->connFifo;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
recv->conn.connFifo[i].mode = NCCL_MODE_OFFSET;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -478,7 +483,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
|
||||
resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
|
||||
// Don't give credits yet in shared mode.
|
||||
resources->sendMem->head = -NCCL_STEPS;
|
||||
(resources->gdcSync ? *resources->gdcSync : resources->sendMem->head) = -NCCL_STEPS;
|
||||
|
||||
// Allocate & Register shared buffers for the Simple protocol
|
||||
int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
|
||||
@@ -624,9 +629,49 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int sub, uint64_t step) {
|
||||
int chunkSize = args->chunkSize;
|
||||
int nNodes = args->specifics.collnetDirect.nNodes;
|
||||
int node = args->specifics.collnetDirect.node;
|
||||
size_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
|
||||
size_t offset = (step*(args->nsubs) + sub)*chunkSize;
|
||||
if (isAllNotOne) {
|
||||
offset = std::min<size_t>(offset, nNodes*sizePerRank);
|
||||
} else {
|
||||
offset = std::max<size_t>(offset, (node+0)*sizePerRank);
|
||||
offset = std::min<size_t>(offset, (node+1)*sizePerRank);
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
#define LAST_OF_GROUP(s) \
|
||||
(s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
|
||||
static int calcRegionOffset(
|
||||
struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step,
|
||||
int side // 0=begin, 1=end
|
||||
) {
|
||||
struct ncclCollNetSharedRes* collNet = args->subs[0].connection->collNet;
|
||||
int slotSize = collNet->buffSize/NCCL_STEPS;
|
||||
int chunkSize = args->chunkSize;
|
||||
int base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS);
|
||||
base *= collNet->nChannels*slotSize;
|
||||
if (args->coll == ncclFuncAllReduce) {
|
||||
return base + (sub+side)*chunkSize;
|
||||
} else {
|
||||
int isAllNotOne = isRecvNotSend ^ (args->coll == ncclFuncReduceScatter);
|
||||
int sub0 = sub - (sub%COLLNET_GROUP_NSUBS);
|
||||
size_t off = sub0*slotSize;
|
||||
off += calcAlgoOffset(args, isAllNotOne, sub+side, step)
|
||||
- calcAlgoOffset(args, isAllNotOne, sub0, step);
|
||||
return base + off;
|
||||
}
|
||||
}
|
||||
|
||||
#define LAST_OF_GROUP(args, s) \
|
||||
((s)%COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || (s) == (args)->nsubs-1)
|
||||
|
||||
static constexpr int calcStepsPerGroup(int nGroups) {
|
||||
//return NCCL_STEPS/nGroups;
|
||||
return NCCL_STEPS;
|
||||
}
|
||||
|
||||
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
@@ -645,88 +690,117 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = NCCL_PROTO_SIMPLE;
|
||||
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
|
||||
int perGroupSteps = NCCL_STEPS / nGroups;
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
|
||||
void* sendMhandle = resources->sendMhandles[p];
|
||||
void* recvMhandle = resources->recvMhandles[p];
|
||||
char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
|
||||
auto reqFifo = resources->reqFifo;
|
||||
int group = s/COLLNET_GROUP_NSUBS;
|
||||
int groupStart = s - (s%COLLNET_GROUP_NSUBS);
|
||||
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->posted%NCCL_STEPS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
|
||||
resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
|
||||
resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
|
||||
__sync_synchronize();
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
|
||||
sub->posted += args->sliceSteps;
|
||||
*sendHead = sub->base + sub->posted - NCCL_STEPS;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
}
|
||||
// Enforce sync between operations of the same group.
|
||||
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received));
|
||||
if (groupSync && sub->received < sub->posted && sub->received < sub->done + perGroupSteps) {
|
||||
if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) {
|
||||
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->received%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
|
||||
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) {
|
||||
// We have something to receive, let's check whether data is ready.
|
||||
int ready = 1;
|
||||
if (s == 0) {
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
|
||||
args->sharedBuff[sharedBuffSlot] = localBuff + offset;
|
||||
args->sharedSize[sharedBuffSlot] = args->chunkSize;
|
||||
if (connFifo[buffSlot].size != -1 && ((*recvTail > (sub->base+sub->received)))) {
|
||||
if (args->coll != ncclFuncAllReduce) {
|
||||
int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0);
|
||||
int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1);
|
||||
if (sendEnd-sendBeg != connFifo[buffSlot].size) {
|
||||
WARN("CollNet sizes: want=%d got=%ld", sendEnd-sendBeg, connFifo[buffSlot].size);
|
||||
return ncclInternalError;
|
||||
}
|
||||
}
|
||||
if (ready) {
|
||||
sizesFifo[buffSlot] = -1;
|
||||
sub->received += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
//continue;
|
||||
// flush HDP if not done
|
||||
if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
|
||||
args->hdp_flushed = *recvTail;
|
||||
*resources->curr_hdp_reg = 1;
|
||||
connFifo[buffSlot].size = -1;
|
||||
sub->received += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
}
|
||||
}
|
||||
// Enforce collective ordering of collnet ops.
|
||||
bool ordered = s==0 ? args->subs[args->nsubs-1].transmitted == sub->transmitted
|
||||
: sub->transmitted < (sub-1)->transmitted;
|
||||
if (ordered && (sub->transmitted < sub->received)) {
|
||||
if (LAST_OF_GROUP(args, s)) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue;
|
||||
|
||||
ssize_t sizePerRank = 0;
|
||||
size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted);
|
||||
size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted);
|
||||
int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0);
|
||||
int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1);
|
||||
int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0);
|
||||
int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1);
|
||||
reqFifo[group][buffSlot].size = recvEnd - recvBeg;
|
||||
size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
|
||||
|
||||
if (sendBeg==sendEnd && recvBeg==recvEnd) {
|
||||
sub->requests[buffSlot] = nullptr; // trivally finished request
|
||||
} else {
|
||||
if (args->coll == ncclFuncAllReduce) {
|
||||
int count = (sendEnd-sendBeg)/eltSize;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region+sendBeg, region+recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
} else {
|
||||
sizePerRank = args->specifics.collnetDirect.sizePerRank;
|
||||
if (args->coll == ncclFuncAllGather) {
|
||||
ncclNetSGE_v8_t recvParts;
|
||||
recvParts.mhandle = recvMhandle;
|
||||
recvParts.address = region + recvBeg;
|
||||
recvParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallgather(
|
||||
resources->collNetComm, region+sendBeg, 1, &recvParts,
|
||||
sizePerRank, allBeg, allEnd-allBeg,
|
||||
sendMhandle, sub->requests+buffSlot));
|
||||
} else {
|
||||
ncclNetSGE_v8_t sendParts;
|
||||
sendParts.mhandle = sendMhandle;
|
||||
sendParts.address = region + sendBeg;
|
||||
sendParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
|
||||
resources->collNetComm, 1, &sendParts, region+recvBeg,
|
||||
sizePerRank, allBeg, allEnd-allBeg,
|
||||
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
|
||||
recvMhandle, sub->requests+buffSlot));
|
||||
}
|
||||
}
|
||||
if (sub->requests[buffSlot] == nullptr) continue;
|
||||
|
||||
if (args->coll == ncclFuncAllReduce) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]);
|
||||
} else if (args->coll == ncclFuncAllGather) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]);
|
||||
} else {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (LAST_OF_GROUP(s) && (sub->transmitted < sub->received)) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
|
||||
if (reqFifo[group][buffSlot].recvBuff != NULL) {
|
||||
int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
|
||||
int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
|
||||
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
|
||||
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] == NULL) continue;
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
sub->transmitted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
sub->transmitted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
// Check whether the network has completed some send operations.
|
||||
if (LAST_OF_GROUP(s) && sub->done < sub->transmitted) {
|
||||
if (LAST_OF_GROUP(args, s) && sub->done < sub->transmitted) {
|
||||
int done, size;
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
|
||||
done = 1;
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
|
||||
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
|
||||
// (reordered store after store is possible on POWER, though not on x86)
|
||||
__sync_synchronize();
|
||||
reqFifo[group][buffSlot].recvBuff = NULL; // Notify recvProxy
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].done += args->sliceSteps;
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done, size %d", (long)sub->done, group, buffSlot, sub->requests[buffSlot], size);
|
||||
sub->requests[buffSlot] = nullptr;
|
||||
reqFifo[group][buffSlot].turnIsSendNotRecv = false; // Notify recvProxy
|
||||
for (int i=groupStart; i<=s; i++) args->subs[i].done += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
int allDone = 1;
|
||||
for (int i=0; i<args->nsubs; i++) {
|
||||
@@ -734,7 +808,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
}
|
||||
if (allDone) {
|
||||
args->state = ncclProxyOpNone;
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d] stopped", sub->done, s);
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] stopped", (long)sub->done, s);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -752,6 +826,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
sub->base = ROUNDUP(resources->step, args->chunkSteps);
|
||||
sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
memset(sub->requests, 0, sizeof(sub->requests));
|
||||
}
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
@@ -759,38 +834,32 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (args->state == ncclProxyOpProgress) {
|
||||
int p = NCCL_PROTO_SIMPLE;
|
||||
int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
|
||||
int perGroupSteps = NCCL_STEPS / nGroups;
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
int group = s/COLLNET_GROUP_NSUBS;
|
||||
int groupStart = s - (s%COLLNET_GROUP_NSUBS);
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
|
||||
void* mhandle = resources->mhandles[p];
|
||||
auto reqFifo = resources->reqFifo;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
|
||||
// Enforce sync between operations of the same group.
|
||||
if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) {
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->posted%NCCL_STEPS;
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
|
||||
reqFifo[group][buffSlot].recvBuff = localBuff + offset;
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
|
||||
reqFifo[group][buffSlot].turnIsSendNotRecv = true;
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] posted buffer", (long)sub->posted, group, buffSlot);
|
||||
sub->posted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
if (LAST_OF_GROUP(s) && (sub->posted > sub->received)) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) {
|
||||
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->received%NCCL_STEPS;
|
||||
if (reqFifo[group][buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
|
||||
args->sharedSize[sharedBuffSlot] = reqFifo[group][buffSlot].size;
|
||||
int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1);
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
|
||||
if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete
|
||||
int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0);
|
||||
int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1);
|
||||
int totalSize = recvEnd - recvBeg;
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
|
||||
sub->received += args->sliceSteps;
|
||||
sub->requests[buffSlot] = NULL;
|
||||
if (reqFifo[group][buffSlot].size > 0 && resources->useGdr && resources->needFlush) {
|
||||
// GDRCOPY support
|
||||
if (resources->gdcFlush) {
|
||||
@@ -801,42 +870,31 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
WARN("NET: GDR Flush only supported on x86_64");
|
||||
return ncclInternalError;
|
||||
#endif
|
||||
sub->requests[buffSlot] = NULL;
|
||||
} else {
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
|
||||
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
|
||||
}
|
||||
} else {
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
}
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (LAST_OF_GROUP(s) && (sub->received > sub->flushed)) {
|
||||
if (LAST_OF_GROUP(args, s) && (sub->flushed < sub->received)) {
|
||||
// Progress flush operations
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
|
||||
int done = 1;
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
|
||||
sub->requests[buffSlot] = nullptr;
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] flushed", (long)sub->flushed, group, buffSlot);
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
//continue;
|
||||
}
|
||||
}
|
||||
if (sub->flushed > sub->transmitted) {
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
if (sub->transmitted < sub->flushed) {
|
||||
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
|
||||
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
|
||||
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
|
||||
offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
|
||||
__sync_synchronize();
|
||||
volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
|
||||
*recvTail = sub->base + sub->flushed;
|
||||
@@ -848,14 +906,15 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// Enforce sync here to make sure the last sub doesn't increase "done" before all others in the group have
|
||||
// reached the same point, otherwise we would start posting buffers to the send proxy before we're done
|
||||
// processing all the shared buffer.
|
||||
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->done == sub->done)) || (s && (sub-1)->done > sub->done));
|
||||
bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done
|
||||
: (sub-1)->done > sub->done;
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) {
|
||||
sub->done += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps && s == args->nsubs-1) {
|
||||
args->state = ncclProxyOpNone;
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d] stopped", sub->done, s);
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d] stopped", (long)sub->done, s);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -868,4 +927,4 @@ struct ncclTransport collNetTransport = {
|
||||
canConnect,
|
||||
{ sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
|
||||
{ recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
|
||||
};
|
||||
};
|
||||
@@ -374,9 +374,12 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
send->conn.tail = &recvMem->tail;
|
||||
send->conn.sizesFifo = recvMem->sizesFifo;
|
||||
send->conn.connFifo = recvMem->connFifo;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
send->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
send->conn.connFifo[i].offset = -1;
|
||||
recvMem->connFifo[i].mode = map->shared ? NCCL_MODE_OFFSET : NCCL_MODE_NORMAL;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -436,9 +439,11 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
|
||||
recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
|
||||
recv->conn.sizesFifo = recvMem->sizesFifo;
|
||||
recv->conn.connFifo = recvMem->connFifo;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
recv->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
recvMem->connFifo[i].mode = map->shared ? NCCL_MODE_OFFSET : NCCL_MODE_NORMAL;
|
||||
}
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
|
||||
recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
|
||||
@@ -548,10 +553,11 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) {
|
||||
static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) {
|
||||
// Use different pools for different channels and also separate send/recv.
|
||||
int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
|
||||
*offset = proxyState->p2pChunkSize * globalSlot;
|
||||
if (size) *size = proxyState->p2pChunkSize;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -802,8 +808,9 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
|
||||
|
||||
// Don't give credits yet in shared mode.
|
||||
resources->sendMem->head = map->shared ? -NCCL_STEPS : 0;
|
||||
for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->sizesFifo[i] = -1;
|
||||
(resources->gdcSync ? *resources->gdcSync : resources->sendMem->head) =
|
||||
(map->shared ? -NCCL_STEPS : 0);
|
||||
for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->connFifo[i].size = -1;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
|
||||
@@ -1099,6 +1106,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
}
|
||||
|
||||
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
|
||||
#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
static int g_npkit_net_poll_cnt = 0;
|
||||
@@ -1114,8 +1122,15 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
|
||||
// Round to next multiple of sliceSteps
|
||||
sub->base = ROUNDUP(resources->step, args->chunkSteps);
|
||||
// Set step base for next op
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
sub->posted = sub->transmitted = sub->done = 0;
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
} else {
|
||||
sub->mhandle = resources->mhandles[args->protocol];
|
||||
}
|
||||
}
|
||||
args->state = ncclProxyOpProgress;
|
||||
args->hdp_flushed = 0;
|
||||
@@ -1128,23 +1143,24 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
if (sub->done == sub->nsteps) continue;
|
||||
struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
|
||||
void* mhandle = resources->mhandles[p];
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSize = stepSize*args->sliceSteps;
|
||||
if (sub->nbytes < buffSize) buffSize = sub->nbytes;
|
||||
// Post buffers to the GPU
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
if (resources->shared) {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
|
||||
resources->recvMem->offsFifo[buffSlot] = offset;
|
||||
__sync_synchronize();
|
||||
if (!sub->reg) {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset, NULL));
|
||||
resources->recvMem->connFifo[buffSlot].offset = offset;
|
||||
__sync_synchronize();
|
||||
}
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
sub->posted += args->sliceSteps;
|
||||
*sendHead = sub->base + sub->posted - NCCL_STEPS;
|
||||
// Only post one credit for registered buffer
|
||||
if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
} else sub->posted += args->sliceSteps;
|
||||
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
|
||||
@@ -1158,14 +1174,15 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
|
||||
uint64_t tail = sub->base + (sub->reg ? 0 : sub->transmitted);
|
||||
if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) {
|
||||
// We have something to receive, let's check if it's completely ready.
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size;
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
sub->npKitSizesFifo[buffSlot] = size;
|
||||
#endif
|
||||
bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared;
|
||||
char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize;
|
||||
int ready = 1;
|
||||
if (p == NCCL_PROTO_LL128) {
|
||||
ready = resources->useGdr;
|
||||
@@ -1173,7 +1190,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
|
||||
// called threadfence()
|
||||
uint64_t flag = sub->base+sub->transmitted+1;
|
||||
int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
|
||||
int nFifoLines = DIVUP(connFifo[buffSlot].size, sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
|
||||
volatile uint64_t* lines = (volatile uint64_t*)buff;
|
||||
ready = 1;
|
||||
for (int i=0; i<nFifoLines; i++) {
|
||||
@@ -1189,6 +1206,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
volatile uint32_t *f2 = &lines[i].flag2;
|
||||
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
|
||||
}
|
||||
} else if (p == NCCL_PROTO_SIMPLE && resources->shared) {
|
||||
buff = sub->reg ? (char*)sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset;
|
||||
}
|
||||
if (ready) {
|
||||
// flush HDP if not done
|
||||
@@ -1197,7 +1216,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
*resources->curr_hdp_reg = 1;
|
||||
}
|
||||
// Data is ready, try to send.
|
||||
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
@@ -1231,12 +1250,13 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// Check whether the network has completed some send operations.
|
||||
if (sub->done < sub->transmitted) {
|
||||
int done;
|
||||
int size;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
if (sub->timestamp[buffSlot] == 0)
|
||||
sub->timestamp[buffSlot] = *(volatile uint64_t*)NpKit::GetCpuTimestamp();
|
||||
#endif
|
||||
NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL));
|
||||
NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size));
|
||||
if (done) {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
@@ -1280,28 +1300,48 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
if (sub->reg) {
|
||||
if (size < sub->nbytes) {
|
||||
sub->buffer = ((char*)sub->buffer)+size;
|
||||
sub->nbytes -= size;
|
||||
// Do one more step (at least)
|
||||
sub->nsteps++;
|
||||
} else {
|
||||
// Signal the GPU the send is complete and it can return.
|
||||
connFifo[sub->base%NCCL_STEPS].size = -1;
|
||||
}
|
||||
}
|
||||
// Make sure size is reset to -1 before we update the head.
|
||||
if (sub->reg == 0) connFifo[buffSlot].size = -1;
|
||||
__sync_synchronize();
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
|
||||
|
||||
if (resources->shared == 0) {
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
*sendHead = sub->base + sub->done;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
}
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps) {
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
args->done++;
|
||||
if (resources->shared == 0) {
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
if (sub->reg) {
|
||||
// We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
|
||||
if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps;
|
||||
} else {
|
||||
*sendHead = sub->base + sub->done;
|
||||
}
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
}
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps) {
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle));
|
||||
}
|
||||
args->done++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (args->done == args->nsubs) {
|
||||
args->state = ncclProxyOpNone;
|
||||
}
|
||||
}
|
||||
if (args->done == args->nsubs) {
|
||||
args->state = ncclProxyOpNone;
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1339,9 +1379,17 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
recvComm = resources->netRecvComm;
|
||||
// Round to next multiple of sliceSteps
|
||||
sub->base = ROUNDUP(resources->step, args->chunkSteps);
|
||||
// Set step base for next op
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
sub->posted = sub->received = sub->transmitted = sub->done = 0;
|
||||
for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
// Register buffer
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
} else {
|
||||
sub->mhandle = resources->mhandles[args->protocol];
|
||||
}
|
||||
}
|
||||
args->state = ncclProxyOpProgress;
|
||||
}
|
||||
@@ -1356,29 +1404,37 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
int sizes[NCCL_PROXY_MAX_SUBS];
|
||||
int tags[NCCL_PROXY_MAX_SUBS];
|
||||
void* mhandles[NCCL_PROXY_MAX_SUBS];
|
||||
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
if (sub->posted < sub->nsteps) {
|
||||
if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
if (sub->reg) maxDepth = 1;
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
if (p == NCCL_PROTO_SIMPLE && resources->shared) {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
|
||||
volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
|
||||
offsFifo[buffSlot] = offset;
|
||||
ptrs[subCount] = localBuff+offset;
|
||||
if (sub->reg) {
|
||||
// Wait until CUDA kernel has started before we access the user buffer directly.
|
||||
if (connFifo[sub->base%NCCL_STEPS].size == -1) continue;
|
||||
ptrs[subCount] = sub->buffer;
|
||||
sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes);
|
||||
} else {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount));
|
||||
connFifo[buffSlot].offset = offset;
|
||||
ptrs[subCount] = localBuff+offset;
|
||||
}
|
||||
} else {
|
||||
ptrs[subCount] = localBuff+buffSlot*stepSize;
|
||||
sizes[subCount] = stepSize*args->sliceSteps;
|
||||
}
|
||||
sizes[subCount] = stepSize*args->sliceSteps;
|
||||
if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
|
||||
tags[subCount] = resources->tpRemoteRank;
|
||||
mhandles[subCount] = resources->mhandles[p];
|
||||
mhandles[subCount] = sub->mhandle;
|
||||
subCount++;
|
||||
}
|
||||
}
|
||||
@@ -1430,6 +1486,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (done) {
|
||||
int needFlush = 0;
|
||||
int totalSize = 0;
|
||||
int subIndex = 0;
|
||||
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
@@ -1449,6 +1506,23 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (sub->received < sub->nsteps) {
|
||||
int size = sizes[subIndex++];
|
||||
if (sub->reg) {
|
||||
if (size < sub->nbytes) {
|
||||
sub->buffer = ((char*)sub->buffer) + size;
|
||||
sub->nbytes -= size;
|
||||
// Do one more step (at least)
|
||||
sub->nsteps++;
|
||||
} else {
|
||||
// Reset connFifo size indicating the GPU was ready to receive.
|
||||
// There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU.
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
connFifo[sub->base%NCCL_STEPS].size = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
sub->received += args->sliceSteps;
|
||||
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
|
||||
if (step < sub->nsteps) {
|
||||
@@ -1476,9 +1550,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
mhandles[subCount] = resources->mhandles[p];
|
||||
int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS;
|
||||
ptrs[subCount] = resources->shared ?
|
||||
(sub->reg ? sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
|
||||
localBuff+buffSlot*stepSize;
|
||||
mhandles[subCount] = sub->mhandle;
|
||||
subCount++;
|
||||
}
|
||||
}
|
||||
@@ -1502,13 +1578,18 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (done) {
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
|
||||
sub->transmitted += args->sliceSteps;
|
||||
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
|
||||
if (step < sub->nsteps) {
|
||||
__sync_synchronize();
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
|
||||
*recvTail = sub->base + sub->transmitted;
|
||||
if (sub->reg) {
|
||||
// We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
|
||||
if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps;
|
||||
} else
|
||||
*recvTail = sub->base + sub->transmitted;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
}
|
||||
}
|
||||
@@ -1526,7 +1607,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (sub->transmitted > sub->done) {
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
volatile uint64_t* sendHead = &resources->sendMem->head;
|
||||
uint64_t done = *sendHead;
|
||||
uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead;
|
||||
while (done > sub->base + sub->done &&
|
||||
// LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
|
||||
sub->transmitted > sub->done) {
|
||||
@@ -1541,7 +1622,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps) {
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle));
|
||||
}
|
||||
args->done++;
|
||||
break;
|
||||
}
|
||||
@@ -1561,4 +1644,4 @@ struct ncclTransport netTransport = {
|
||||
canConnect,
|
||||
{ sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
|
||||
{ recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
|
||||
};
|
||||
};
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -96,6 +96,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
props->pciPath = ncclNetSocketDevs[dev].pciPath;
|
||||
props->guid = dev;
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
props->regIsGlobal = 0;
|
||||
NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
|
||||
props->latency = 0; // Not set
|
||||
props->port = 0;
|
||||
@@ -534,7 +535,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
||||
ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
|
||||
return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
|
||||
}
|
||||
// Memory-handle deregistration entry point for the socket transport.
// The body performs no work: neither argument is read and the call always
// returns ncclSuccess.
ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) {
  (void)comm;     // unused
  (void)mhandle;  // unused
  return ncclSuccess;
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "utils.h"
|
||||
#include "proxy.h"
|
||||
#include "enqueue.h"
|
||||
#include "register.h"
|
||||
|
||||
#if CUDART_VERSION >= 12010
|
||||
|
||||
@@ -20,19 +21,8 @@ struct graphRegData {
|
||||
};
|
||||
|
||||
struct localRegData {
|
||||
/* Registration record data */
|
||||
uintptr_t recSendbuff, recRecvbuff;
|
||||
intptr_t recSendOffset, recRecvOffset;
|
||||
/* Registration request data */
|
||||
uintptr_t reqSendbuff, reqRecvbuff;
|
||||
size_t reqSendSize, reqRecvSize;
|
||||
intptr_t reqSendOffset, reqRecvOffset;
|
||||
};
|
||||
|
||||
struct localRequestData {
|
||||
uintptr_t reqBuff;
|
||||
size_t reqSize;
|
||||
intptr_t reqOffset;
|
||||
struct ncclReg reg;
|
||||
intptr_t offset;
|
||||
};
|
||||
|
||||
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
@@ -116,11 +106,9 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int
|
||||
// cuMem UDS support
|
||||
int fd = -1;
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
|
||||
struct ncclProxyConnector proxyConn;
|
||||
int tpProxyRank = comm->topParentRanks[rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn));
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, shareableHandle, &fd));
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd));
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
|
||||
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
|
||||
(void) close(fd);
|
||||
@@ -248,7 +236,8 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
|
||||
|
||||
int gpuCount;
|
||||
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
|
||||
if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
|
||||
// NVLS is not supported on MNNVL yet
|
||||
if (!ncclParamNvlsEnable() || gpuCount <= 2 || comm->nNodes > 1 || comm->MNNVL) return ncclSuccess;
|
||||
|
||||
CUdevice dev;
|
||||
int driverVersion;
|
||||
@@ -292,14 +281,14 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
if (nvlsShare) {
|
||||
/* reuse NVLS resources */
|
||||
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
|
||||
for (int c = 0; c < comm->nvlsChannels; c++) {
|
||||
for (int c = 0; c < comm->nChannels; c++) {
|
||||
NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup);
|
||||
}
|
||||
|
||||
comm->nvlsResources = parent->nvlsResources;
|
||||
ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount);
|
||||
} else {
|
||||
int nChannels;
|
||||
int nChannels = comm->nChannels;
|
||||
struct ncclNvlsSharedRes* resources;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
@@ -312,7 +301,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
|
||||
}
|
||||
|
||||
nChannels = resources->nChannels = comm->nvlsChannels;
|
||||
resources->nChannels = comm->nvlsChannels;
|
||||
for (int c = 0; c < nChannels; c++) {
|
||||
NCCLCHECK(initNvlsChannel(comm, c, parent, false));
|
||||
}
|
||||
@@ -390,7 +379,8 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
}
|
||||
|
||||
/* create shared memory for fast NVLS buffer registration */
|
||||
typeSize = sizeof(struct localRegData);
|
||||
typeSize = sizeof(struct localRegData) << 1;
|
||||
|
||||
if (comm->localRank == 0) {
|
||||
shmPath[0] = '\0';
|
||||
NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, cleanup);
|
||||
@@ -405,6 +395,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks);
|
||||
comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t));
|
||||
comm->nvlsResources->nvlsShmem.round = 0;
|
||||
comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize;
|
||||
|
||||
return res;
|
||||
|
||||
@@ -427,23 +418,59 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *reqData, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
|
||||
ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclRegRecord *regRecord = NULL;
|
||||
struct localRequestData *myReqData = &reqData[comm->localRank];
|
||||
struct ncclReg *regRecord = NULL;
|
||||
CUdeviceptr regPtr = 0;
|
||||
CUmulticastObjectProp prop;
|
||||
char shareableHandle[NVLS_HANDLE_SIZE];
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
size_t granularity;
|
||||
size_t minSize;
|
||||
size_t minSize = SIZE_MAX;
|
||||
bool localRegBufUsed = false;
|
||||
struct localRegData* regData = NULL;
|
||||
cudaPointerAttributes attr;
|
||||
|
||||
/* get minimal size of nvls buffers */
|
||||
minSize = reqData[0].reqSize;
|
||||
for (int i = 1; i < comm->localRanks; ++i) {
|
||||
if (minSize > reqData[i].reqSize)
|
||||
minSize = reqData[i].reqSize;
|
||||
NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
|
||||
|
||||
if (userBuff) {
|
||||
NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
|
||||
if (regRecord) {
|
||||
CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
|
||||
if (attr.type == cudaMemoryTypeDevice) {
|
||||
size_t regSize = regRecord->pages * comm->regCache.pageSize;
|
||||
prop = comm->nvlsResources->properties;
|
||||
prop.size = regSize;
|
||||
CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
|
||||
CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
|
||||
if (regSize % granularity == 0) {
|
||||
regRecord->regSize = regSize;
|
||||
} else {
|
||||
regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr);
|
||||
}
|
||||
|
||||
if (regRecord->addr % comm->nvlsResources->ucGran == 0 && regRecord->regSize % granularity == 0) {
|
||||
regRecord->state |= NVLS_REG_POSSIBLE;
|
||||
memcpy(&regData[comm->localRank].reg, regRecord, sizeof(struct ncclReg));
|
||||
regData[comm->localRank].offset = userBuff - regRecord->addr;
|
||||
}
|
||||
}
|
||||
|
||||
if ((regRecord->state & NVLS_REG_POSSIBLE) == 0) {
|
||||
regRecord->state |= NVLS_REG_NO_SUPPORT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail);
|
||||
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if ((regData[i].reg.state & NVLS_REG_POSSIBLE) == 0) {
|
||||
goto fail;
|
||||
}
|
||||
/* get minimal reg size of nvls buffers */
|
||||
if (minSize > regData[i].reg.regSize)
|
||||
minSize = regData[i].reg.regSize;
|
||||
}
|
||||
|
||||
/* start registration */
|
||||
@@ -459,7 +486,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *r
|
||||
}
|
||||
|
||||
CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
|
||||
CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)myReqData->reqBuff, minSize, 0), ret, fail);
|
||||
CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
|
||||
|
||||
// Create a VA for the NVLS
|
||||
CUCHECKGOTO(cuMemAddressReserve(&regPtr, minSize, granularity, 0U, 0), ret, fail);
|
||||
@@ -467,26 +494,28 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *r
|
||||
CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail);
|
||||
CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&regRecord, 1), ret, fail);
|
||||
regRecord->buff = myReqData->reqBuff;
|
||||
regRecord->size = myReqData->reqSize;
|
||||
regRecord->regAddr = regPtr;
|
||||
regRecord->regSize = minSize;
|
||||
regRecord->dev = comm->nvlsResources->dev;
|
||||
regRecord->mcHandle = mcHandle;
|
||||
regRecord->state |= NVLS_REG_COMPLETE;
|
||||
/* get all buffer addresses */
|
||||
NCCLCHECKGOTO(ncclCalloc(&regRecord->addrs, comm->localRanks), ret, fail);
|
||||
regRecord->addrs[comm->localRank] = regRecord->buff;
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->addrs + comm->localRank, regRecord->addrs, sizeof(uintptr_t)), ret, fail);
|
||||
/* enqueue record */
|
||||
ncclIntruQueueEnqueue(&comm->regRecordQueue, regRecord);
|
||||
regRecord->caddrs[comm->localRank] = regRecord->addr;
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail);
|
||||
|
||||
/* Although registration is done, we still need to check whether the offsets are same among ranks. */
|
||||
for (int i = 0; i < comm->localRanks - 1; ++i) {
|
||||
if (regData[i].offset != regData[i + 1].offset) {
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
localRegBufUsed = true;
|
||||
|
||||
exit:
|
||||
if (localRegBufUsed)
|
||||
*regAddr = (uintptr_t)regPtr + userBuff - myReqData->reqBuff;
|
||||
if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
|
||||
*regUsed = localRegBufUsed;
|
||||
free(regData);
|
||||
return ret;
|
||||
fail:
|
||||
localRegBufUsed = false;
|
||||
@@ -497,77 +526,52 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
bool localRegBufUsed = false;
|
||||
struct localRegData *regData = NULL;
|
||||
struct localRequestData *reqData = NULL;
|
||||
struct ncclRegRecord *regRecordHead = NULL, *sendRegRecord = NULL, *recvRegRecord = NULL;
|
||||
struct ncclRegRequest *regRequestHead = NULL, *sendRegRequest = NULL, *recvRegRequest = NULL;
|
||||
bool sendNeedReg = false, recvNeedReg = false;
|
||||
CUdeviceptr regSendPtr = 0;
|
||||
CUdeviceptr regRecvPtr = 0;
|
||||
struct ncclReg *sendRegRecord = NULL;
|
||||
struct ncclReg *recvRegRecord = NULL;
|
||||
|
||||
*outRegBufUsed = false;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks * 2), ret, fail);
|
||||
|
||||
/* first check whether the buffer has been registered and matches each other globally */
|
||||
regRecordHead = ncclIntruQueueHead(&comm->regRecordQueue);
|
||||
while (regRecordHead && ((sendRegRecord == NULL && sendbuff != NULL) || (recvRegRecord == NULL && recvbuff != NULL))) {
|
||||
/* check send reg record */
|
||||
if (sendRegRecord == NULL && regRecordHead->buff <= (uintptr_t)sendbuff &&
|
||||
regRecordHead->buff + regRecordHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
|
||||
regData[comm->localRank].recSendbuff = regRecordHead->buff;
|
||||
regData[comm->localRank].recSendOffset = (uintptr_t)sendbuff - regRecordHead->buff;
|
||||
sendRegRecord = regRecordHead;
|
||||
if (sendbuff) {
|
||||
NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail);
|
||||
if (sendRegRecord) {
|
||||
memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
|
||||
regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr;
|
||||
}
|
||||
|
||||
/* check recv reg record */
|
||||
if (recvRegRecord == NULL && regRecordHead->buff <= (uintptr_t)recvbuff &&
|
||||
regRecordHead->buff + regRecordHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
|
||||
regData[comm->localRank].recRecvbuff = regRecordHead->buff;
|
||||
regData[comm->localRank].recRecvOffset = (uintptr_t)recvbuff - regRecordHead->buff;
|
||||
recvRegRecord = regRecordHead;
|
||||
}
|
||||
regRecordHead = regRecordHead->next;
|
||||
}
|
||||
|
||||
/* prepare registration request for later reference */
|
||||
regRequestHead = ncclIntruQueueHead(&comm->regRequestQueue);
|
||||
while (regRequestHead && ((sendRegRequest == NULL && sendbuff != NULL) || (recvRegRequest == NULL && recvbuff != NULL))) {
|
||||
/* check send reg request */
|
||||
if (regRequestHead->buff <= (uintptr_t)sendbuff &&
|
||||
regRequestHead->buff + regRequestHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
|
||||
regData[comm->localRank].reqSendbuff = regRequestHead->buff;
|
||||
regData[comm->localRank].reqSendSize = regRequestHead->size;
|
||||
regData[comm->localRank].reqSendOffset = (uintptr_t)sendbuff - regRequestHead->buff;
|
||||
sendRegRequest = regRequestHead;
|
||||
if (recvbuff) {
|
||||
NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail);
|
||||
if (recvRegRecord) {
|
||||
memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
|
||||
regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr;
|
||||
}
|
||||
|
||||
/* check recv reg request */
|
||||
if (regRequestHead->buff <= (uintptr_t)recvbuff &&
|
||||
regRequestHead->buff + regRequestHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
|
||||
regData[comm->localRank].reqRecvbuff = regRequestHead->buff;
|
||||
regData[comm->localRank].reqRecvSize = regRequestHead->size;
|
||||
regData[comm->localRank].reqRecvOffset = (uintptr_t)recvbuff - regRequestHead->buff;
|
||||
recvRegRequest = regRequestHead;
|
||||
}
|
||||
regRequestHead = regRequestHead->next;
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail);
|
||||
NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail);
|
||||
|
||||
/* first check whether all local ranks find their registered buffer */
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if (regData[i].recSendbuff == 0 || sendRegRecord->addrs[i] != regData[i].recSendbuff) {
|
||||
if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.addr) {
|
||||
sendNeedReg = true;
|
||||
}
|
||||
|
||||
if (regData[i].recRecvbuff == 0 || recvRegRecord->addrs[i] != regData[i].recRecvbuff) {
|
||||
if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.addr) {
|
||||
recvNeedReg = true;
|
||||
}
|
||||
|
||||
if ((regData[i * 2].reg.state & NVLS_REG_NO_SUPPORT) || (regData[i * 2 + 1].reg.state & NVLS_REG_NO_SUPPORT)) {
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
if (sendNeedReg == false) {
|
||||
for (int i = 0; i < comm->localRanks - 1; ++i) {
|
||||
if (regData[i].recSendOffset != regData[i + 1].recSendOffset) {
|
||||
if (regData[i * 2].offset != regData[(i + 1) * 2].offset) {
|
||||
/* offsets are different, so we cannot apply user buffer registration */
|
||||
goto fail;
|
||||
}
|
||||
@@ -575,18 +579,18 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
|
||||
/* reuse previous registered buffer if possible */
|
||||
if (!sendNeedReg)
|
||||
regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank].recSendOffset);
|
||||
regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank * 2].offset);
|
||||
}
|
||||
|
||||
if (recvNeedReg == false) {
|
||||
for (int i = 0; i < comm->localRanks - 1; ++i) {
|
||||
if (regData[i].recRecvOffset != regData[i + 1].recRecvOffset) {
|
||||
if (regData[i * 2 + 1].offset != regData[(i + 1) * 2 + 1].offset) {
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
if (!recvNeedReg)
|
||||
regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank].recRecvOffset);
|
||||
regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank * 2 + 1].offset);
|
||||
}
|
||||
|
||||
if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
|
||||
@@ -597,29 +601,13 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
|
||||
/* Start registration. No registered buffers were found, so check whether both the send and recv buffers are located
|
||||
* in register request cache. */
|
||||
NCCLCHECKGOTO(ncclCalloc(&reqData, comm->localRanks), ret, fail);
|
||||
if (sendNeedReg && sendbuff != NULL) {
|
||||
/* copy request data got from previous shmem AG */
|
||||
intptr_t offset = regData[0].reqSendOffset;
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if (regData[i].reqSendbuff == 0 || offset != regData[i].reqSendOffset) goto fail;
|
||||
reqData[i].reqBuff = regData[i].reqSendbuff;
|
||||
reqData[i].reqSize = regData[i].reqSendSize;
|
||||
reqData[i].reqOffset = regData[i].reqSendOffset;
|
||||
}
|
||||
tryRegisterBuffer(comm, reqData, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
|
||||
if (sendNeedReg && sendbuff) {
|
||||
tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
|
||||
if (localRegBufUsed == false) goto fail;
|
||||
}
|
||||
|
||||
if (recvNeedReg && recvbuff != NULL) {
|
||||
intptr_t offset = regData[0].reqRecvOffset;
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
if (regData[i].reqRecvbuff == 0 || offset != regData[i].reqRecvOffset) goto fail;
|
||||
reqData[i].reqBuff = regData[i].reqRecvbuff;
|
||||
reqData[i].reqSize = regData[i].reqRecvSize;
|
||||
reqData[i].reqOffset = regData[i].reqRecvOffset;
|
||||
}
|
||||
tryRegisterBuffer(comm, reqData, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
|
||||
if (recvNeedReg && recvbuff) {
|
||||
tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
|
||||
if (localRegBufUsed == false) goto fail;
|
||||
}
|
||||
|
||||
@@ -630,7 +618,6 @@ exit:
|
||||
*outRegBufRecv = (void*)regRecvPtr;
|
||||
*outRegBufUsed = localRegBufUsed;
|
||||
free(regData);
|
||||
free(reqData);
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
localRegBufUsed = false;
|
||||
@@ -647,7 +634,7 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
|
||||
CUmulticastObjectProp prop;
|
||||
char shareableHandle[NVLS_HANDLE_SIZE];
|
||||
CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
|
||||
size_t sendGran, recvGran;
|
||||
size_t sendGran = 0, recvGran = 0;
|
||||
bool *regBufFlags = NULL;
|
||||
struct graphRegData *rdata = NULL;
|
||||
const void *baseSend = NULL;
|
||||
@@ -667,19 +654,17 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
|
||||
if (recvbuff != NULL)
|
||||
CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);
|
||||
|
||||
memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
|
||||
prop.size = baseSendSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
prop.size = baseRecvSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
localRegBufUsed = ((uint64_t)baseSend % sendGran != 0 || (uint64_t)baseRecv % recvGran != 0) ? false : true;
|
||||
localRegBufUsed = ((uint64_t)baseSend % comm->nvlsResources->ucGran != 0 || (uint64_t)baseRecv % comm->nvlsResources->ucGran != 0) ? false : true;
|
||||
regBufFlags[comm->localRank] = localRegBufUsed;
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
|
||||
for (int i = 0; i < comm->localRanks; ++i)
|
||||
if (regBufFlags[i] == false) goto fail;
|
||||
|
||||
memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
|
||||
if (sendbuff != NULL) {
|
||||
prop.size = baseSendSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
/* check send buffer offset and size */
|
||||
rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
|
||||
rdata[comm->localRank].size = baseSendSize;
|
||||
@@ -719,6 +704,9 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
|
||||
}
|
||||
|
||||
if (recvbuff != NULL) {
|
||||
prop.size = baseRecvSize;
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
|
||||
rdata[comm->localRank].size = baseRecvSize;
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
|
||||
|
||||
@@ -113,6 +113,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
}
|
||||
#endif
|
||||
|
||||
// MNNVL support
|
||||
if (info1->hostHash != info2->hostHash) {
|
||||
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
|
||||
if (*ret) return ncclSuccess;
|
||||
}
|
||||
|
||||
// Rule out different nodes / isolated containers
|
||||
if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) {
|
||||
*ret = 0;
|
||||
@@ -203,8 +209,9 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
|
||||
if (ncclCuMemEnable()) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
|
||||
// cuMem API support
|
||||
CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
|
||||
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
|
||||
@@ -244,18 +251,16 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
|
||||
#if CUDART_VERSION >= 11030
|
||||
// cuMem API support
|
||||
CUdeviceptr dptr = 0;
|
||||
CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
|
||||
|
||||
// Import and map the remote memory descriptor to the local GPU
|
||||
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
|
||||
// UDS fd support
|
||||
struct ncclProxyConnector proxyConn;
|
||||
int fd = -1;
|
||||
// Send cuMem handle to remote for conversion to an fd
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn));
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, &cuDesc->data, &fd));
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd));
|
||||
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
|
||||
CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
|
||||
(void) close(fd);
|
||||
@@ -293,6 +298,8 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
|
||||
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
|
||||
NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);
|
||||
|
||||
// Evaluates to non-zero when the two peer-info pointers have equal hostHash
// fields and equal pidHash fields.
// Both parameters are parenthesized so that any pointer-valued expression
// (e.g. `infos + 1`) expands correctly; each argument is evaluated twice, so
// avoid passing expressions with side effects.
#define P2P_SAME_PID(MYINFO, PEERINFO) (((MYINFO)->hostHash == (PEERINFO)->hostHash) && ((MYINFO)->pidHash == (PEERINFO)->pidHash))
|
||||
|
||||
static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
|
||||
int p2p;
|
||||
// Queries the topology to see if the GPUs are Ampere and
|
||||
@@ -305,7 +312,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
|
||||
}
|
||||
|
||||
static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (P2P_SAME_PID(myInfo, peerInfo)) {
|
||||
if (peerInfo->cudaDev != myInfo->cudaDev) {
|
||||
// Same PID different GPUs, enable P2P access
|
||||
// Legacy CUDA IPC
|
||||
@@ -333,15 +340,9 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
|
||||
*devMem = p2pBuff->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
} else {
|
||||
if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) {
|
||||
// Same PID and GPU
|
||||
*devMem = p2pBuff->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
} else {
|
||||
// Different PID or different GPU
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
|
||||
*ipcPtr = *devMem;
|
||||
}
|
||||
// Different PID
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
|
||||
*ipcPtr = *devMem;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -382,7 +383,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
resources->type = P2P_DIRECT;
|
||||
send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
|
||||
@@ -391,8 +392,9 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
// cuMem API support
|
||||
if (ncclCuMemEnable()) {
|
||||
resources->type = P2P_CUMEM;
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);;
|
||||
const char *MNNVL = comm->MNNVL ? "MNNVL" : "CUMEM";
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, MNNVL, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);;
|
||||
} else {
|
||||
// Legacy CUDA IPC
|
||||
resources->type = P2P_IPC;
|
||||
@@ -446,7 +448,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
|
||||
resources->type = P2P_DIRECT;
|
||||
recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
} else {
|
||||
@@ -496,7 +498,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
|
||||
if (useMemcpy) {
|
||||
send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
|
||||
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
|
||||
send->conn.connFifo = resources->proxyInfo.ceRecvMem->connFifo;
|
||||
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
|
||||
// Send SIMPLE buff to proxy, and replace it by local buffer
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
|
||||
@@ -744,11 +746,11 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
}
|
||||
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
|
||||
// Check GPU has sent everything
|
||||
if ((*recvTail > sub->base+sub->transmitted)) {
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = connFifo[buffSlot].size;
|
||||
CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream));
|
||||
CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
|
||||
sub->transmitted += args->sliceSteps;
|
||||
@@ -793,4 +795,4 @@ static void initCeOperation() {
|
||||
}
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -152,7 +152,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
send->conn.head = &resources->devHostMem->head;
|
||||
|
||||
if (useMemcpyRecv) {
|
||||
send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
|
||||
send->conn.connFifo = resources->devRemHostMem->connFifo;
|
||||
}
|
||||
if (useMemcpySend) {
|
||||
int tpProxyRank;
|
||||
@@ -162,7 +162,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
|
||||
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
|
||||
send->conn.tail = &proxyInfo.ceRecvMem->tail;
|
||||
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
|
||||
send->conn.connFifo = proxyInfo.ceRecvMem->connFifo;
|
||||
}
|
||||
|
||||
// We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time
|
||||
@@ -315,15 +315,15 @@ static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
}
|
||||
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
|
||||
// Check GPU has sent everything
|
||||
if ((*recvTail > sub->base+sub->transmitted)) {
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = connFifo[buffSlot].size;
|
||||
CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream));
|
||||
CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
|
||||
resources->recvMem->sizesFifo[buffSlot] = size;
|
||||
__sync_synchronize(); // make sure sizesFifo is visible
|
||||
resources->recvMem->connFifo[buffSlot].size = size;
|
||||
__sync_synchronize(); // make sure connFifo[].size is visible
|
||||
sub->transmitted += args->sliceSteps;
|
||||
}
|
||||
}
|
||||
@@ -374,11 +374,11 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
}
|
||||
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
|
||||
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
|
||||
volatile int* sizesFifo = resources->recvMem->sizesFifo;
|
||||
volatile struct ncclConnFifo* connFifo = resources->recvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
// Check data is ready in SHM
|
||||
if ((*recvTail > sub->base+sub->transmitted)) {
|
||||
int size = sizesFifo[buffSlot];
|
||||
int size = connFifo[buffSlot].size;
|
||||
CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream));
|
||||
CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
|
||||
sub->transmitted += args->sliceSteps;
|
||||
|
||||
@@ -26,7 +26,20 @@ if(BUILD_TESTS)
|
||||
include_directories(${GTEST_INCLUDE_DIRS} ./common)
|
||||
|
||||
# Collect testing framework source files
|
||||
set (COMMON_SOURCE_FILES
|
||||
set(TEST_SOURCE_FILES
|
||||
AllGatherTests.cpp
|
||||
AllReduceTests.cpp
|
||||
AllToAllTests.cpp
|
||||
AllToAllVTests.cpp
|
||||
BroadcastTests.cpp
|
||||
GatherTests.cpp
|
||||
GroupCallTests.cpp
|
||||
NonBlockingTests.cpp
|
||||
ReduceScatterTests.cpp
|
||||
ReduceTests.cpp
|
||||
ScatterTests.cpp
|
||||
SendRecvTests.cpp
|
||||
StandaloneTests.cpp
|
||||
common/main.cpp
|
||||
common/CollectiveArgs.cpp
|
||||
common/EnvVars.cpp
|
||||
@@ -36,38 +49,7 @@ if(BUILD_TESTS)
|
||||
common/TestBedChild.cpp
|
||||
)
|
||||
|
||||
# Collect source files for tests
|
||||
if(ONLY_FUNCS)
|
||||
# Convert input string to a list
|
||||
string(REPLACE "|" ";" CONFIG_LIST ${ONLY_FUNCS})
|
||||
|
||||
# For each config in config list
|
||||
foreach(item ${CONFIG_LIST})
|
||||
string(REPLACE " " ";" CONFIG_PARAMS ${item})
|
||||
list(GET CONFIG_PARAMS 0 COLL)
|
||||
|
||||
set(TEST_FILE "${COLL}Tests.cpp")
|
||||
list(APPEND TEST_SOURCE_FILES ${TEST_FILE})
|
||||
endforeach()
|
||||
else()
|
||||
set(TEST_SOURCE_FILES
|
||||
AllGatherTests.cpp
|
||||
AllReduceTests.cpp
|
||||
AllToAllTests.cpp
|
||||
AllToAllVTests.cpp
|
||||
BroadcastTests.cpp
|
||||
GatherTests.cpp
|
||||
GroupCallTests.cpp
|
||||
NonBlockingTests.cpp
|
||||
ReduceScatterTests.cpp
|
||||
ReduceTests.cpp
|
||||
ScatterTests.cpp
|
||||
SendRecvTests.cpp
|
||||
StandaloneTests.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
add_executable(rccl-UnitTests ${COMMON_SOURCE_FILES} ${TEST_SOURCE_FILES})
|
||||
add_executable(rccl-UnitTests ${TEST_SOURCE_FILES})
|
||||
|
||||
## Set rccl-UnitTests include directories
|
||||
target_include_directories(rccl-UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
|
||||
|
||||
Ссылка в новой задаче
Block a user