2.20.3-1
Add support for alternating rings, allow for cross-nic rings without cross-rail communication. Add support for user buffer registration for network send/recv. Optimize aggregated operations to better utilize all channels. Add flattening for BCM PCI gen5 switches. Add support for inter-node NVLink communication. Add support for port fusion in NET/IB. Add support for ReduceScatter and AllGather using Collnet. Update net API to v8. Fix hang during A2A connection.
Цей коміт міститься в:
+100
-26
@@ -15,15 +15,37 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;
|
||||
|
||||
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
|
||||
//pluginPciPath(dev, &props.pciPath);
|
||||
//pluginPtrSupport(dev, &props.ptrSupport);
|
||||
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) {
|
||||
// Below are default values, if unsure don't change.
|
||||
|
||||
props->name = "Example";
|
||||
// Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
|
||||
props->pciPath = NULL;
|
||||
// Only used to detect NICs with multiple PCI attachments.
|
||||
props->guid = 0;
|
||||
// Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
// If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
|
||||
props->regIsGlobal = 0;
|
||||
// Speed in *Mbps*. 100000 means 100G
|
||||
props->speed = 100000;
|
||||
// Port number, used in conjunction with guid
|
||||
props->port = 0;
|
||||
// Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
|
||||
props->latency = 0;
|
||||
// Maximum number of comm objects we can create.
|
||||
props->maxComms = 1024*1024;
|
||||
// Maximum number of receive operations taken by irecv().
|
||||
props->maxRecvs = 1;
|
||||
// Coupling with NCCL network device-side code.
|
||||
props->netDeviceType = 0;
|
||||
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
|
||||
return ncclInternalError;
|
||||
}
|
||||
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
|
||||
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; }
|
||||
@@ -38,7 +60,7 @@ __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_m
|
||||
|
||||
#define PLUGIN_NAME "Plugin"
|
||||
|
||||
const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
const ncclNet_v8_t ncclNetPlugin_v8 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
@@ -60,10 +82,62 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
};
|
||||
|
||||
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
|
||||
//pluginPciPath(dev, &props.pciPath);
|
||||
//pluginPtrSupport(dev, &props.ptrSupport);
|
||||
return ncclInternalError;
|
||||
__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
props_v7->name = props.name;
|
||||
props_v7->pciPath = props.pciPath;
|
||||
props_v7->guid = props.guid;
|
||||
props_v7->ptrSupport = props.ptrSupport;
|
||||
props_v7->speed = props.speed;
|
||||
props_v7->port = props.port;
|
||||
props_v7->maxComms = props.maxComms;
|
||||
props_v7->maxRecvs = props.maxRecvs;
|
||||
props_v7->netDeviceType = props.netDeviceType;
|
||||
props_v7->netDeviceVersion = props.netDeviceVersion;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
|
||||
return pluginRegMr(collComm, data, size, type, mhandle);
|
||||
}
|
||||
|
||||
const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v7,
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
.irecv = pluginIrecv,
|
||||
.iflush = pluginIflush,
|
||||
.test = pluginTest,
|
||||
.closeSend = pluginCloseSend,
|
||||
.closeRecv = pluginCloseRecv,
|
||||
.closeListen = pluginCloseListen,
|
||||
.getDeviceMr = pluginGetDeviceMr,
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
};
|
||||
|
||||
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
props_v6->name = props.name;
|
||||
props_v6->pciPath = props.pciPath;
|
||||
props_v6->guid = props.guid;
|
||||
props_v6->ptrSupport = props.ptrSupport;
|
||||
props_v6->speed = props.speed;
|
||||
props_v6->port = props.port;
|
||||
props_v6->maxComms = props.maxComms;
|
||||
props_v6->maxRecvs = props.maxRecvs;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
|
||||
@@ -77,7 +151,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
@@ -98,7 +172,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
.irecv = pluginIrecv,
|
||||
@@ -110,17 +184,17 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
};
|
||||
|
||||
/* v4 Compat */
|
||||
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
|
||||
ncclNetProperties_v6_t props_v6;
|
||||
ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
|
||||
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
props->name = props_v6.name;
|
||||
props->pciPath = props_v6.pciPath;
|
||||
props->guid = props_v6.guid;
|
||||
props->ptrSupport = props_v6.ptrSupport;
|
||||
props->speed = props_v6.speed;
|
||||
props->port = props_v6.port;
|
||||
props->maxComms = props_v6.maxComms;
|
||||
props_v4->name = props.name;
|
||||
props_v4->pciPath = props.pciPath;
|
||||
props_v4->guid = props.guid;
|
||||
props_v4->ptrSupport = props.ptrSupport;
|
||||
props_v4->speed = props.speed;
|
||||
props_v4->port = props.port;
|
||||
props_v4->maxComms = props.maxComms;
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
|
||||
@@ -157,7 +231,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
@@ -202,7 +276,7 @@ const ncclNet_v3_t ncclNetPlugin_v3 = {
|
||||
.listen = pluginListen_v3,
|
||||
.connect = pluginConnect_v3,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
@@ -223,7 +297,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend_v4,
|
||||
.irecv = pluginIrecv_v4,
|
||||
|
||||
Посилання в новій задачі
Заблокувати користувача