2.21.5-1
Add support for IB SHARP 1PPN operation with user buffers.
Improve support for MNNVL, add NVLS support and multi-clique support.
 * Detect the NVLS clique through NVML
 * Exchange XML between peers in the same NVLS clique and fuse XMLs before creating the topology graph.
 * Rework bootstrap allgather algorithms to allow for large allgather operations intra-node (XML exchange).
Net/IB: add support for dynamic GID detection.
 * Automatically select RoCEv2/IPv4 interface by default. Allow to select IPv6 or even the network/mask.
Reduce NVLS memory usage.
 * Add stepSize as property of a connection to allow for different sizes on different peers; set it to 128K for NVLink SHARP.
Improve tuner loading
 * Look for more paths, be more consistent with the network device plugin.
 * Also search for tuner support inside the net plugin.
Improve tuner API
 * Add context to support multi-device per process.
Add magic number around comm object to detect comm corruption.
 * Add some basic check around communicators so that we can report a problem when a communicator gets corrupted or a wrong comm pointer is passed to NCCL.
Fix net/IB error path. Github PR #1164
Fix collnet rail mapping with split comm.
Fix packet reordering issue causing bootstrap mismatch
 * Use a different tag in ncclTransportP2pSetup for the connectInfo exchange and the following barrier.
Fix hang when crossNic is inconsistent between ranks.
Fix minCompCap/maxCompCap computation. Github issue #1184
This commit is contained in:
@@ -7,8 +7,6 @@
|
||||
#ifndef NET_DEVICE_H_
|
||||
#define NET_DEVICE_H_
|
||||
|
||||
#include "net_device.h"
|
||||
|
||||
#define NCCL_NET_DEVICE_INVALID_VERSION 0x0
|
||||
#define NCCL_NET_MTU_SIZE 4096
|
||||
|
||||
|
||||
@@ -39,13 +39,17 @@ typedef struct {
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// nNodes: number of nodes in current communicator.
|
||||
// logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type, e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - collNetSupport: whether collnet supports this type
|
||||
@@ -62,16 +66,17 @@ typedef struct {
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
ncclResult_t (*destroy)();
|
||||
} ncclTuner_v1_t;
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
|
||||
typedef ncclTuner_v1_t ncclTuner_t;
|
||||
typedef ncclTuner_v2_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -8,17 +8,17 @@
|
||||
|
||||
#define __hidden __attribute__ ((visibility("hidden")))
|
||||
|
||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
|
||||
|
||||
__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes,
|
||||
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
|
||||
|
||||
__hidden ncclResult_t pluginDestroy() { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
|
||||
|
||||
#define PLUGIN_NAME "Example"
|
||||
|
||||
const ncclTuner_v1_t ncclTunerPlugin_v1 = {
|
||||
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.getCollInfo = pluginGetCollInfo,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 20
|
||||
NCCL_MINOR := 21
|
||||
NCCL_PATCH := 5
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+148
-103
@@ -80,6 +80,16 @@ static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int si
|
||||
NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t bootstrapNetSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock, void* recvData, int recvSize) {
|
||||
int senderRecvSize;
|
||||
NCCLCHECK(ncclSocketSendRecv(sendSock, &sendSize, sizeof(int), recvSock, &senderRecvSize, sizeof(int)));
|
||||
if (senderRecvSize > recvSize) {
|
||||
WARN("Message truncated : received %d bytes instead of %d", senderRecvSize, recvSize);
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(ncclSocketSendRecv(sendSock, sendData, sendSize, recvSock, recvData, recvSize));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct extInfo {
|
||||
int rank;
|
||||
@@ -390,103 +400,40 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
// Bootstrap send/receive functions
|
||||
//
|
||||
// We do not keep connections opened with all ranks at all times, and we have no guarantee
|
||||
// that connections to our unique listen socket will arrive in the same order as we need
|
||||
// them. Therefore, when establishing a connection, the sender sends a (peer, tag) tuple to
|
||||
// allow the receiver to identify the flow, and keep it in an unexpected queue if needed.
|
||||
|
||||
ncclResult_t bootstrapConnect(void* commState, int peer, int tag, struct ncclSocket* sock) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
char* data = (char*)allData;
|
||||
int rank = state->rank;
|
||||
int nranks = state->nranks;
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
|
||||
|
||||
/* Simple ring based AllGather
|
||||
* At each step i receive data from (rank-i-1) from left
|
||||
* and send previous step's data from (rank-i) to right
|
||||
*/
|
||||
for (int i=0; i<nranks-1; i++) {
|
||||
size_t rslice = (rank - i - 1 + nranks) % nranks;
|
||||
size_t sslice = (rank - i + nranks) % nranks;
|
||||
|
||||
// Send slice to the right
|
||||
NCCLCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size));
|
||||
// Recv slice from the left
|
||||
NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
NCCLCHECKGOTO(ncclSocketInit(sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(sock, &state->rank, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(sock, &tag, sizeof(int)), ret, fail);
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
NCCLCHECK(ncclSocketClose(sock));
|
||||
return ret;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
struct ncclSocket sock;
|
||||
|
||||
NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail);
|
||||
TRACE(NCCL_BOOTSTRAP, "Sending to peer=%d tag=%d size=%d", peer, tag, size);
|
||||
NCCLCHECK(bootstrapConnect(commState, peer, tag, &sock));
|
||||
NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, exit);
|
||||
|
||||
TRACE(NCCL_BOOTSTRAP, "Sent to peer=%d tag=%d size=%d", peer, tag, size);
|
||||
|
||||
exit:
|
||||
NCCLCHECK(ncclSocketClose(&sock));
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
|
||||
if (nranks == 1) return ncclSuccess;
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag);
|
||||
|
||||
/* Simple intra process barrier
|
||||
*
|
||||
* Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manber,
|
||||
* "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988
|
||||
*/
|
||||
int data[1];
|
||||
for (int mask=1; mask<nranks; mask<<=1) {
|
||||
int src = (rank - mask + nranks) % nranks;
|
||||
int dst = (rank + mask) % nranks;
|
||||
NCCLCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data)));
|
||||
NCCLCHECK(bootstrapRecv(commState, ranks[src], tag, data, sizeof(data)));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size) {
|
||||
if (nranks == 1) return ncclSuccess;
|
||||
char* data = (char*)allData;
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size);
|
||||
|
||||
for (int i=1; i<nranks; i++) {
|
||||
int src = (rank - i + nranks) % nranks;
|
||||
int dst = (rank + i) % nranks;
|
||||
NCCLCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data+rank*size, size));
|
||||
NCCLCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data+src*size, size));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// IntraNode in-place Broadcast
|
||||
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
|
||||
if (nranks == 1) return ncclSuccess;
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
|
||||
|
||||
if (rank == root) {
|
||||
for (int i=0; i<nranks; i++) {
|
||||
if (i != root) NCCLCHECK(bootstrapSend(commState, ranks[i], /*tag=*/ranks[i], bcastData, size));
|
||||
}
|
||||
}
|
||||
else {
|
||||
NCCLCHECK(bootstrapRecv(commState, ranks[root], /*tag=*/ranks[rank], bcastData, size));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
|
||||
@@ -543,38 +490,136 @@ static void unexpectedFree(struct bootstrapState* state) {
|
||||
}
|
||||
|
||||
// We can't know who we'll receive from, so we need to receive everything at once
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
|
||||
ncclResult_t bootstrapAccept(void* commState, int peer, int tag, struct ncclSocket* sock) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
struct ncclSocket sock;
|
||||
int newPeer, newTag;
|
||||
|
||||
// Search unexpected connections first
|
||||
int found;
|
||||
NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found));
|
||||
if (found) {
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
|
||||
goto exit;
|
||||
}
|
||||
NCCLCHECK(unexpectedDequeue(state, peer, tag, sock, &found));
|
||||
if (found) return ncclSuccess;
|
||||
|
||||
// Then look for new connections
|
||||
while (1) {
|
||||
NCCLCHECKGOTO(ncclSocketInit(&sock), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketAccept(&sock, &state->listenSock), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail);
|
||||
if (newPeer == peer && newTag == tag) {
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
|
||||
goto exit;
|
||||
}
|
||||
// Unexpected connection. Save for later.
|
||||
NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketInit(sock), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketAccept(sock, &state->listenSock), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(sock, &newPeer, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(sock, &newTag, sizeof(int)), ret, fail);
|
||||
if (newPeer == peer && newTag == tag) return ncclSuccess;
|
||||
NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, sock), ret, fail);
|
||||
}
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
NCCLCHECK(ncclSocketClose(sock));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// We can't know who we'll receive from, so we need to receive everything at once
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
|
||||
ncclResult_t ret;
|
||||
struct ncclSocket sock;
|
||||
NCCLCHECK(bootstrapAccept(commState, peer, tag, &sock));
|
||||
TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, exit);
|
||||
exit:
|
||||
NCCLCHECK(ncclSocketClose(&sock));
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// Collective algorithms, based on bootstrapSend/Recv, and sometimes bootstrapConnect/Accept
|
||||
|
||||
ncclResult_t bootstrapRingAllGather(struct ncclSocket* prevSocket, struct ncclSocket* nextSocket, int rank, int nranks, char* data, int size) {
|
||||
/* Simple ring based AllGather
|
||||
* At each step i receive data from (rank-i-1) from prev
|
||||
* and send previous step's data from (rank-i) to next
|
||||
*/
|
||||
for (int i=0; i<nranks-1; i++) {
|
||||
size_t rslice = (rank - i - 1 + nranks) % nranks;
|
||||
size_t sslice = (rank - i + nranks) % nranks;
|
||||
|
||||
// Send slice to the right, recv slice from the left
|
||||
NCCLCHECK(bootstrapNetSendRecv(nextSocket, data+sslice*size, size, prevSocket, data+rslice*size, size));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
int rank = state->rank;
|
||||
int nranks = state->nranks;
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
|
||||
|
||||
NCCLCHECK(bootstrapRingAllGather(&state->ringRecvSocket, &state->ringSendSocket, rank, nranks, (char*)allData, size));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
|
||||
if (nranks == 1) return ncclSuccess;
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag);
|
||||
|
||||
/* Simple [intra] process barrier
|
||||
*
|
||||
* Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manber,
|
||||
* "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988
|
||||
*/
|
||||
int data[1];
|
||||
for (int mask=1; mask<nranks; mask<<=1) {
|
||||
int src = (rank - mask + nranks) % nranks;
|
||||
int dst = (rank + mask) % nranks;
|
||||
NCCLCHECK(bootstrapSend(commState, ranks ? ranks[dst] : dst, tag, data, sizeof(data)));
|
||||
NCCLCHECK(bootstrapRecv(commState, ranks ? ranks[src] : src, tag, data, sizeof(data)));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag) {
|
||||
return bootstrapIntraNodeBarrier(commState, NULL, rank, nranks, tag);
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size) {
|
||||
if (nranks == 1) return ncclSuccess;
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size);
|
||||
|
||||
int prevRank = ranks[(rank - 1 + nranks)%nranks];
|
||||
int nextRank = ranks[(rank + 1) % nranks];
|
||||
struct ncclSocket prevSocket, nextSocket;
|
||||
NCCLCHECK(bootstrapConnect(commState, nextRank, 0, &nextSocket));
|
||||
NCCLCHECK(bootstrapAccept(commState, prevRank, 0, &prevSocket));
|
||||
|
||||
NCCLCHECK(bootstrapRingAllGather(&prevSocket, &nextSocket, rank, nranks, (char*)allData, size));
|
||||
|
||||
NCCLCHECK(ncclSocketClose(&nextSocket));
|
||||
NCCLCHECK(ncclSocketClose(&prevSocket));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// [IntraNode] in-place Broadcast
|
||||
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
|
||||
if (nranks == 1) return ncclSuccess;
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
|
||||
|
||||
if (rank == root) {
|
||||
for (int i=0; i<nranks; i++) {
|
||||
if (i != root) NCCLCHECK(bootstrapSend(commState, ranks ? ranks[i] : i, /*tag=*/ranks ? ranks[i] : i, bcastData, size));
|
||||
}
|
||||
}
|
||||
else {
|
||||
NCCLCHECK(bootstrapRecv(commState, ranks ? ranks[root] : root, /*tag=*/ranks ? ranks[rank] : rank, bcastData, size));
|
||||
}
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size) {
|
||||
return bootstrapIntraNodeBroadcast(commState, NULL, rank, nranks, root, bcastData, size);
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapClose(void* commState) {
|
||||
|
||||
+7
-5
@@ -13,7 +13,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
|
||||
if (channel->id != -1) return ncclSuccess;
|
||||
|
||||
int nRanks = comm->nRanks;
|
||||
int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */;
|
||||
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
|
||||
int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
|
||||
channel->id = channelId;
|
||||
channel->workFifoSent = 0;
|
||||
|
||||
@@ -73,10 +74,11 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
|
||||
|
||||
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
|
||||
|
||||
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
|
||||
if (share) {
|
||||
channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
|
||||
channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
|
||||
for (int r = 0; r < comm->localRanks; ++r) {
|
||||
for (int r = 0; r < nvlsRanks; ++r) {
|
||||
int tr = comm->topParentLocalRanks[r];
|
||||
uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
|
||||
channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
|
||||
@@ -85,9 +87,9 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
|
||||
ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclCalloc(&channel->nvlsPeers, comm->localRanks));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, comm->localRanks, sharedRes->deviceStream.cudaStream));
|
||||
for (int r = 0; r < comm->localRanks; ++r) {
|
||||
NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream));
|
||||
for (int r = 0; r < nvlsRanks; ++r) {
|
||||
uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
|
||||
channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
|
||||
|
||||
+16
-8
@@ -23,7 +23,8 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
|
||||
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
NCCLCHECK(ncclEnqueueCheck(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
@@ -46,7 +47,8 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
|
||||
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
||||
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
NCCLCHECK(ncclEnqueueCheck(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
@@ -67,14 +69,16 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n
|
||||
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
NCCLCHECK(ncclEnqueueCheck(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
/* Deprecated original "in place" function, similar to MPI */
|
||||
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
|
||||
NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
@@ -98,7 +102,8 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
struct ncclInfo info = { ncclFuncReduce, "Reduce",
|
||||
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
||||
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
NCCLCHECK(ncclEnqueueCheck(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
@@ -120,7 +125,8 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv
|
||||
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
|
||||
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
||||
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
NCCLCHECK(ncclEnqueueCheck(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct NvtxParamsSendRecv {
|
||||
@@ -144,7 +150,8 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
ret = ncclEnqueueCheck(&info);
|
||||
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
|
||||
exit:
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ret;
|
||||
}
|
||||
@@ -161,7 +168,8 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int
|
||||
1, 1 };
|
||||
ncclResult_t ret;
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
ret = ncclEnqueueCheck(&info);
|
||||
NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
|
||||
exit:
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -79,6 +79,10 @@ void ncclDebugInit() {
|
||||
mask = NCCL_PROXY;
|
||||
} else if (strcasecmp(subsys, "NVLS") == 0) {
|
||||
mask = NCCL_NVLS;
|
||||
} else if (strcasecmp(subsys, "BOOTSTRAP") == 0) {
|
||||
mask = NCCL_BOOTSTRAP;
|
||||
} else if (strcasecmp(subsys, "REG") == 0) {
|
||||
mask = NCCL_REG;
|
||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
|
||||
+38
-22
@@ -253,18 +253,26 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
|
||||
int tn = nWarps1*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 1: send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
|
||||
/*redOpArg=*/0, 0*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.send(beg-railOneBeg, max(ssize_t(0), end-beg));
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Phase 1: send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
|
||||
/*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.send(beg - railOneBeg, max(ssize_t(0), end - beg));
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -272,16 +280,24 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
|
||||
tn = nWarps2*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 2: Recv network -> deposit output + send to bcast
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, direct->heads+1, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 1*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*BcastSendNotRecv=*/true> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Phase 2: Recv network -> deposit output + send to bcast
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
Scatterer</*BcastSendNotRecv=*/true> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
+72
-29
@@ -297,13 +297,21 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
}
|
||||
} else {
|
||||
// Directly send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == tidStartReduce) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.send(offset, nelem);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (tid < tidStartBcast && hasUp) {
|
||||
@@ -328,14 +336,22 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
|
||||
prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
} else {
|
||||
// Recv from network (no post thread needed)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.recv(offset, nelem, /*postOp=*/true);
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == tidStartBcast) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Recv from network (no post thread needed)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.recv(offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -616,21 +632,31 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
groupNthreads = nthreads-nthreadsSplit;
|
||||
}
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
|
||||
if (tid < nthreadsSplit) {
|
||||
if (recv == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.send(offset, nelem);
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (groupTid == 0) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
@@ -639,19 +665,36 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
|
||||
if (recv == nranks) {
|
||||
// I'm the first in the broadcast chain, I need to perform the division (postOp)
|
||||
if (send == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.recv(offset, nelem, /*postOp*/true);
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (groupTid == 0) {
|
||||
int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.recv(offset, nelem, /*postOp*/true);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
|
||||
args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
if (send == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
|
||||
@@ -18,19 +18,21 @@ typedef void(*ncclDevFuncPtr_t)();
|
||||
extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];
|
||||
|
||||
struct ncclShmemGroup {
|
||||
ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
|
||||
ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
|
||||
void* srcs[NCCL_MAX_NVLS_ARITY+1];
|
||||
void* dsts[NCCL_MAX_NVLS_ARITY+1];
|
||||
ncclConnInfo *recvConns[NCCL_MAX_ARITY];
|
||||
ncclConnInfo *sendConns[NCCL_MAX_ARITY];
|
||||
void* userInput;
|
||||
void* userOutput;
|
||||
void* srcs[NCCL_MAX_ARITY+1];
|
||||
void* dsts[NCCL_MAX_ARITY+1];
|
||||
union {
|
||||
unpackGroupShmem unpack;
|
||||
} devicePlugin;
|
||||
int32_t dstSizes[NCCL_MAX_NVLS_ARITY+1];
|
||||
int32_t dstSizes[NCCL_MAX_ARITY+1];
|
||||
};
|
||||
|
||||
struct ncclShmemData {
|
||||
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
|
||||
uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
|
||||
uint64_t redOpArgs[NCCL_MAX_ARITY+1];
|
||||
int channelId;
|
||||
int aborted;
|
||||
alignas(16) struct ncclDevComm comm;
|
||||
|
||||
+64
-47
@@ -5,6 +5,7 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "network/unpack/unpack.h"
|
||||
#include <cassert>
|
||||
|
||||
template<typename T, typename RedOp, typename Fan, int Direct,
|
||||
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
|
||||
@@ -13,9 +14,7 @@ class Primitives<
|
||||
> {
|
||||
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
|
||||
static constexpr int Input=0, Output=1;
|
||||
static constexpr int RoleInput = 0x01,
|
||||
RoleOutput = 0x02,
|
||||
RoleWaitRecv = 0x04,
|
||||
static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use
|
||||
RoleWaitSend = 0x08,
|
||||
RolePostSend = 0x10,
|
||||
RolePostRecv = 0x20,
|
||||
@@ -40,13 +39,11 @@ class Primitives<
|
||||
int group;
|
||||
uint64_t step;
|
||||
struct ncclConnFifo* connFifo = NULL;
|
||||
union {
|
||||
T *userBuff; // (flags & (RoleInput|RoleOutput))
|
||||
T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
|
||||
};
|
||||
T *directBuff;
|
||||
T* connEltsFifo;
|
||||
T* directBuff;
|
||||
uint64_t *connStepPtr;
|
||||
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
|
||||
int connStepSize; // Connection step size
|
||||
void* mhandle;
|
||||
void* netDeviceHandle;
|
||||
|
||||
@@ -153,7 +150,7 @@ class Primitives<
|
||||
} else if (flags & DirectRead) { // empty send
|
||||
ptrs[index] = nullptr;
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
} else if (!isSendNotRecv && DirectRecv) {
|
||||
if (flags & (DirectRead | NvlsDirectRead)) {
|
||||
@@ -161,11 +158,11 @@ class Primitives<
|
||||
} else if (flags & DirectWrite) {
|
||||
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
}
|
||||
else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
|
||||
ncclNetDeviceIncrementHead(group);
|
||||
@@ -232,10 +229,12 @@ class Primitives<
|
||||
#endif
|
||||
do {
|
||||
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
|
||||
if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
|
||||
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
|
||||
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
|
||||
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
|
||||
if (tid == 0) {
|
||||
T* userInput = (T*)ncclShmem.groups[group].userInput;
|
||||
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
|
||||
if (Src) ncclShmem.groups[group].srcs[0] = (SrcBuf==Input ? userInput : userOutput) + srcIx + offset;
|
||||
if (Dst) ncclShmem.groups[group].dsts[0] = (DstBuf==Input ? userInput : userOutput) + dstIx + offset;
|
||||
}
|
||||
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
|
||||
subBarrier();
|
||||
/* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
|
||||
@@ -303,6 +302,28 @@ class Primitives<
|
||||
}
|
||||
|
||||
public:
|
||||
// Advances the send side of connection `connIndex` on channel peer `peer` by
// `steps` steps without moving any data through a Primitives object.
//   peer      - index into ncclShmem.channel.peers[]
//   connIndex - which entry of that peer's send[] array to update
//   steps     - number of steps to add to the connection's step counter
// The new step value is then written to the connection's `tail` pointer.
// In the call sites visible in this diff, exactly one thread calls this on the
// NCCL_COLLNET_REG_BUFFER path, with steps = divUp(bytes, NCCL_MAX_COLLNET_SIZE).
// NOTE(review): `step += steps` is a plain read-modify-write; presumably this
// relies on that single-caller usage — confirm before adding new callers.
static inline __device__ void sendPeerNotify(int peer, int connIndex, int steps) {
|
||||
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
|
||||
peerPtr->send[connIndex].step += steps;
|
||||
// Publish the updated step through `tail`.
// NOTE(review): helper name suggests a relaxed, system-scope global store — confirm.
st_relaxed_sys_global(peerPtr->send[connIndex].tail, peerPtr->send[connIndex].step);
|
||||
}
|
||||
|
||||
// Advances the receive side of connection `connIndex` on channel peer `peer`
// by `steps` steps, writes the new step to the connection's `head` pointer, and
// then spins until the value read from `tail` reaches that step.
//   peer      - index into ncclShmem.channel.peers[]
//   connIndex - which entry of that peer's recv[] array to update
//   steps     - number of steps to add to the connection's step counter
// While spinning, the abort flag is polled periodically; if it is set, the
// function records ncclShmem.aborted = 1 and returns without the tail having
// reached the target step.
// In the call sites visible in this diff, exactly one thread calls this on the
// NCCL_COLLNET_REG_BUFFER path instead of running a Primitives recv loop.
static inline __device__ void recvPeerNotify(int peer, int connIndex, int steps) {
|
||||
// Iterations since the last abort check.
int spins = 0;
|
||||
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
|
||||
peerPtr->recv[connIndex].step += steps;
|
||||
// Publish the updated step through `head` before waiting on `tail`.
// NOTE(review): helper name suggests a relaxed, system-scope global store — confirm.
st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
|
||||
// Wait until the value loaded from `tail` is at least the new step.
while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
|
||||
// Only look at the abort flag once the counter hits NCCL_SPINS_BEFORE_CHECK_ABORT,
// then restart the count.
if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
if (*ncclShmem.comm.abortFlag) {
|
||||
// Record the abort in shared state and stop waiting.
ncclShmem.aborted = 1;
|
||||
break;
|
||||
}
|
||||
spins = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int Recv, int Send, typename Fn>
|
||||
__device__ __forceinline__ void process(Fn &&fn) {
|
||||
#pragma unroll 1
|
||||
@@ -371,7 +392,7 @@ private:
|
||||
if (Send) {
|
||||
// Scatter pre-scales data of input buffer only in non-Direct case
|
||||
constexpr int PreOpSrcs = DirectSend ? 0 : 1;
|
||||
if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
|
||||
if (tid==0) ncclShmem.groups[group].srcs[0] = (T*)ncclShmem.groups[group].userInput + inpIx + offset;
|
||||
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
|
||||
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
|
||||
subBarrier();
|
||||
@@ -391,7 +412,7 @@ private:
|
||||
}
|
||||
}
|
||||
} else if (Recv) {
|
||||
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
|
||||
if (tid==0) ncclShmem.groups[group].dsts[0] = (T*)ncclShmem.groups[group].userOutput + outIx + offset;
|
||||
ssize_t pOffset = index*peerOffset;
|
||||
if (skip >= 0 && index >= skip) pOffset += peerElem;
|
||||
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
|
||||
@@ -436,6 +457,7 @@ private:
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->tail;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
connStepSize = conn->stepSize/sizeof(T);
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
if (conn->connFifo != nullptr) {
|
||||
flags |= ConnFifoEnabled;
|
||||
@@ -484,6 +506,7 @@ private:
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->head;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
connStepSize = conn->stepSize/sizeof(T);
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
if (connFifo == nullptr && Direct) {
|
||||
// User buffers have been registered
|
||||
@@ -528,24 +551,19 @@ private:
|
||||
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
|
||||
constexpr int ThreadPerSync = 8;
|
||||
constexpr int ThreadPerSync =
|
||||
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
|
||||
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
|
||||
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
|
||||
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
|
||||
|
||||
int g = tid / ThreadPerSync;
|
||||
int ng = nthreads / ThreadPerSync;
|
||||
index = tid % ThreadPerSync;
|
||||
index = -1;
|
||||
flags = 0;
|
||||
if (g == 0) {
|
||||
if (index < nrecv) flags |= RoleWaitRecv;
|
||||
if (index == nrecv) flags |= RoleInput;
|
||||
} else if (g == 1) {
|
||||
if (index < nsend) flags |= RoleWaitSend;
|
||||
if (index == nsend) flags |= RoleOutput;
|
||||
} else if (g == ng - 2) {
|
||||
if (index < nrecv) flags |= RolePostRecv;
|
||||
} else if (g == ng - 1) {
|
||||
if (index < nsend) flags |= RolePostSend;
|
||||
}
|
||||
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
|
||||
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
|
||||
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
|
||||
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
|
||||
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
|
||||
|
||||
int peer = 0;
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
|
||||
@@ -558,15 +576,11 @@ private:
|
||||
|
||||
if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
flags |= AnyNetDeviceUnpack;
|
||||
// g == 0 is the first ThreadPerSync # of threads of this warp
|
||||
// g == 0 is also the RoleWaitRecv threads of this group, thus the thread ID will correlate to the peer index
|
||||
if (g == 0) {
|
||||
uint32_t mask = __ballot_sync((1U << ThreadPerSync) - 1, (flags & NetDeviceUnpack) ? 1 : 0);
|
||||
|
||||
// We only want to update the shared memory variable with a single thread
|
||||
if (tid == 0) {
|
||||
ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
|
||||
}
|
||||
// RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers
|
||||
// have NetDeviceUnpack.
|
||||
uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0);
|
||||
if (tid == 0) {
|
||||
ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -588,7 +602,8 @@ private:
|
||||
// was accessed directly.
|
||||
uint64_t prevStep = step - StepPerSlice;
|
||||
volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size);
|
||||
while (*ptr != -1);
|
||||
int spins = 0;
|
||||
while (*ptr != -1) if (checkAbort(spins)) break;
|
||||
}
|
||||
|
||||
if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
|
||||
@@ -601,11 +616,11 @@ private:
|
||||
}
|
||||
|
||||
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
|
||||
if (flags & RoleInput) {
|
||||
userBuff = (T*)inputBuf;
|
||||
if (tid==0) {
|
||||
ncclShmem.groups[group].userInput = (void*)inputBuf;
|
||||
ncclShmem.groups[group].userOutput = (void*)outputBuf;
|
||||
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
|
||||
}
|
||||
if (flags & RoleOutput) userBuff = (T*)outputBuf;
|
||||
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
|
||||
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
|
||||
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
|
||||
@@ -696,8 +711,10 @@ private:
|
||||
}
|
||||
|
||||
__device__ void moveDataPtrs(intptr_t delta) {
|
||||
if (flags & (RoleInput|RoleOutput))
|
||||
userBuff += delta;
|
||||
if (tid==0) {
|
||||
ncclShmem.groups[group].userInput = (T*)ncclShmem.groups[group].userInput + delta;
|
||||
ncclShmem.groups[group].userOutput = (T*)ncclShmem.groups[group].userOutput + delta;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void send(intptr_t inpIx, int eltN) {
|
||||
|
||||
@@ -262,16 +262,24 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
|
||||
tn = nWarps2*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 2: Reduce from peers + local input -> send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, direct->heads+1, &direct->out, nullptr, nullptr,
|
||||
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/false> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Phase 2: Reduce from peers + local input -> send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
|
||||
args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/false> scat;
|
||||
scat.args = args;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -279,18 +287,26 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
|
||||
|
||||
tn = nWarps3*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 3: recv from network
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
|
||||
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.recv(beg-railOneBeg, max(ssize_t(0), end-beg), /*postOp=*/true);
|
||||
if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Phase 3: recv from network
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
|
||||
args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
|
||||
ssize_t railOneEnd = railOneBeg + sizePerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
+61
-10
@@ -680,6 +680,36 @@ static ncclResult_t registerIntraNodeBuffers(
|
||||
}
|
||||
}
|
||||
info->regBufType = NCCL_IPC_REG_BUFFER;
|
||||
} else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opFull.op != ncclDevPreMulSum && info->opFull.op != ncclDevSumPostDiv) {
|
||||
int sendRegBufFlag = 0;
|
||||
int recvRegBufFlag = 0;
|
||||
void *sendHandle, *recvHandle;
|
||||
|
||||
if (ncclParamLocalRegister()) {
|
||||
ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle);
|
||||
info->sendMhandle = sendHandle;
|
||||
if (sendRegBufFlag) {
|
||||
ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle);
|
||||
info->recvMhandle = recvHandle;
|
||||
}
|
||||
}
|
||||
|
||||
if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && plan->persistent && ncclParamGraphRegister()) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, plan, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle);
|
||||
info->sendMhandle = sendHandle;
|
||||
if (sendRegBufFlag) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, plan, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle);
|
||||
info->recvMhandle = recvHandle;
|
||||
}
|
||||
}
|
||||
|
||||
if (sendRegBufFlag && recvRegBufFlag) {
|
||||
info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1));
|
||||
info->regBufType = NCCL_COLLNET_REG_BUFFER;
|
||||
if (sendRegBufFlag == 1 && recvRegBufFlag == 1) {
|
||||
INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, info->sendbuffSize, info->recvbuff, recvHandle, info->recvbuffSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
fallback:
|
||||
#endif
|
||||
@@ -806,7 +836,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
while (!ncclIntruQueueEmpty(&tasks->collCBDQueue)) {
|
||||
// Get nChannels and peek whether the budget allows before we enqueue
|
||||
collInfo = ncclIntruQueueHead(&tasks->collCBDQueue);
|
||||
collInfo->nChannels = DIVUP(collInfo->aggnBytes * tasks->usableChannels, totalCBDBytes);
|
||||
collInfo->nChannels = DIVUP(collInfo->workBytes * tasks->usableChannels, totalCBDBytes);
|
||||
// Haven't got nChannels info yet, relax the budget boundary a bit.
|
||||
if (*nWorkBudget < collInfo->nChannels) return ncclSuccess;
|
||||
|
||||
@@ -1173,6 +1203,12 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
|
||||
INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj);
|
||||
}
|
||||
while (!ncclIntruQueueEmpty(&plan->collnetHandleQueue)) {
|
||||
struct ncclCollnetHandleList* obj = ncclIntruQueueDequeue(&plan->collnetHandleQueue);
|
||||
NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyconn, obj->collnetHandle));
|
||||
INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->collnetHandle, obj->size, obj->buffer);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclCollnetHandleList, obj);
|
||||
}
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
return ncclSuccess;
|
||||
@@ -1512,7 +1548,7 @@ static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport,
|
||||
collInfo->nChannels = 0;
|
||||
if (collInfo->comm->tuner != NULL) {
|
||||
NCCLCHECK(collInfo->comm->tuner->getCollInfo(
|
||||
collInfo->coll, collInfo->nBytes,
|
||||
collInfo->comm->tunerContext, collInfo->coll, collInfo->nBytes,
|
||||
collNetSupport, nvlsSupport, numPipeOps,
|
||||
&collInfo->algorithm, &collInfo->protocol, &collInfo->nChannels));
|
||||
}
|
||||
@@ -1649,7 +1685,7 @@ static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, siz
|
||||
static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg) {
|
||||
if (regBufType == NCCL_IPC_REG_BUFFER) {
|
||||
workElemReg->elem = *work;
|
||||
workElemReg->elem.regUsed = 1;
|
||||
workElemReg->elem.regUsed = NCCL_IPC_REG_BUFFER;
|
||||
for (int i = 0; i < NCCL_MAX_DIRECT_ARITY; i++) {
|
||||
int peer = channel->collnetDirect.down[i];
|
||||
if (peer == -1) break;
|
||||
@@ -1666,10 +1702,13 @@ static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkEl
|
||||
}
|
||||
} else if (regBufType == NCCL_NVLS_REG_BUFFER) {
|
||||
workElemReg->elem = *work;
|
||||
workElemReg->elem.regUsed = 1;
|
||||
workElemReg->elem.regUsed = NCCL_NVLS_REG_BUFFER;
|
||||
/* NVLS only has one send and recv buffer registered */
|
||||
workElemReg->dnInputs[0] = regBufSend[0];
|
||||
workElemReg->dnOutputs[0] = regBufRecv[0];
|
||||
} else if (regBufType == NCCL_COLLNET_REG_BUFFER) {
|
||||
workElemReg->elem = *work;
|
||||
workElemReg->elem.regUsed = NCCL_COLLNET_REG_BUFFER;
|
||||
} else {
|
||||
/* impossible value */
|
||||
WARN("Invalid regBufType %d\n", regBufType);
|
||||
@@ -1678,7 +1717,7 @@ static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkEl
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NvlsTreeChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2);
|
||||
NCCL_PARAM(NvlsTreeMaxChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2);
|
||||
|
||||
static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels) {
|
||||
int stepSize = collInfo->comm->buffSizes[collInfo->protocol] / NCCL_STEPS;
|
||||
@@ -1701,7 +1740,7 @@ static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nByte
|
||||
while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
} else if (collInfo->algorithm == NCCL_ALGO_NVLS) {
|
||||
int maxChunkSize = 131072;
|
||||
int maxChunkSize = collInfo->comm->nvlsChunkSize;
|
||||
if (collInfo->comm->nNodes > 1 && collInfo->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
|
||||
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
|
||||
@@ -1712,7 +1751,8 @@ static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nByte
|
||||
} else if (collInfo->algorithm == NCCL_ALGO_NVLS_TREE) {
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
|
||||
uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads;
|
||||
int maxChunkSize = ncclParamNvlsTreeChunkSize();
|
||||
chunkSize = collInfo->comm->nvlsChunkSize;
|
||||
int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize();
|
||||
if (maxChunkSize == -2) maxChunkSize = collInfo->comm->nNodes >= 4 ? 65536 : chunkSize;
|
||||
chunkSize = std::min(chunkSize, maxChunkSize);
|
||||
if ((nBytes < (32 * (concurrentOps * chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
|
||||
@@ -1747,11 +1787,22 @@ static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, ui
|
||||
proxyOp->pattern = collInfo->pattern;
|
||||
proxyOp->coll = collInfo->coll;
|
||||
proxyOp->root = collInfo->root;
|
||||
proxyOp->reg = 0;
|
||||
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
|
||||
// because some protocols need to transmit more than the total size, plus they sometimes
|
||||
// round up
|
||||
proxyOp->nbytes = collInfo->stepSize * proxyOp->sliceSteps;
|
||||
if (collInfo->regBufType == NCCL_COLLNET_REG_BUFFER) {
|
||||
proxyOp->reg = 1;
|
||||
proxyOp->nsteps = DIVUP(collInfo->nBytes, NCCL_MAX_COLLNET_SIZE);
|
||||
proxyOp->sendMhandle = collInfo->sendMhandle;
|
||||
proxyOp->recvMhandle = collInfo->recvMhandle;
|
||||
proxyOp->sendbuff = (uint8_t*)collInfo->sendbuff;
|
||||
proxyOp->recvbuff = (uint8_t*)collInfo->recvbuff;
|
||||
proxyOp->nbytes = collInfo->nBytes;
|
||||
} else {
|
||||
proxyOp->reg = 0;
|
||||
}
|
||||
|
||||
proxyOp->channelId = channelId;
|
||||
proxyOp->opCount = opCount;
|
||||
|
||||
@@ -1958,7 +2009,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int devOld = -1;
|
||||
|
||||
NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, fail);
|
||||
NCCLCHECKGOTO(CommCheck(info->comm, info->opName, "comm"), ret, fail);
|
||||
// Check whether communicator is ready to communicate
|
||||
NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail);
|
||||
|
||||
@@ -1990,7 +2041,7 @@ fail:
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
|
||||
NCCLCHECK(CommCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
|
||||
/* join init thread before creating PreMulSum op. */
|
||||
NCCLCHECK(ncclCommEnsureReady(comm));
|
||||
|
||||
|
||||
+18
-8
@@ -17,6 +17,7 @@
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
|
||||
int rank = comm->rank;
|
||||
int localRanks = comm->topo->nodes[GPU].count;
|
||||
int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
|
||||
int nChannels = comm->nChannels;
|
||||
|
||||
topoRanks->nvlsHeadNum = 0;
|
||||
@@ -71,7 +72,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
|
||||
// Get nvls heads and the number of heads. Duplicate head is not allowed.
|
||||
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
|
||||
bool addHead = true;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
|
||||
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;
|
||||
|
||||
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
|
||||
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
|
||||
@@ -257,8 +258,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
|
||||
channel->nvls.nNodes = comm->nNodes;
|
||||
if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
}
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes == 1 || comm->MNNVL) return ncclSuccess;
|
||||
if (comm->nNodes == 1) return ncclSuccess;
|
||||
|
||||
// Connect Trees
|
||||
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
|
||||
@@ -310,9 +310,9 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
|
||||
|
||||
struct ncclNvls* nvls0 = &comm->channels[0].nvls;
|
||||
struct ncclNvls* nvls1 = &comm->channels[1].nvls;
|
||||
INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
|
||||
nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp,
|
||||
nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp);
|
||||
INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d",
|
||||
nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp,
|
||||
nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -363,13 +363,14 @@ void exchangeValues(int* v0, int* v1) {
|
||||
*v0 = tmp;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs) {
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
|
||||
int nranks = comm->nRanks;
|
||||
int nNodes = comm->nNodes;
|
||||
int nChannels = comm->nChannels;
|
||||
int minHeadNum = INT_MAX;
|
||||
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
|
||||
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
|
||||
@@ -380,7 +381,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
|
||||
|
||||
// Alternate rings to avoid crossing rails
|
||||
if (graphs[NCCL_ALGO_RING]->crossNic && (comm->nNodes % 2) == 0 && (nChannels % 2) == 0) {
|
||||
if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
|
||||
for (int r=0; r<comm->nRanks; r++) {
|
||||
if (comm->rankToNode[r] % 2 == 1) {
|
||||
// Exchange rings
|
||||
@@ -469,11 +470,20 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
}
|
||||
|
||||
comm->collChannels = comm->nChannels;
|
||||
#if CUDART_VERSION >= 12010
|
||||
// Support maximal channel usage for aggregation
|
||||
if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
|
||||
comm->nvlsChannels = parent->nvlsResources->nChannels;
|
||||
}
|
||||
if (comm->nChannels < comm->nvlsChannels) {
|
||||
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
|
||||
}
|
||||
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
|
||||
#endif
|
||||
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
|
||||
nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
|
||||
comm->collChannels = std::min(comm->collChannels, comm->nChannels);
|
||||
}
|
||||
|
||||
// Create rings array and check all is fine
|
||||
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
|
||||
|
||||
+32
-32
@@ -58,6 +58,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
struct ncclTopoNode* remNode = link->remNode;
|
||||
if (remNode->paths[baseNode->type] == NULL) {
|
||||
NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||
for (int i=0; i<system->nodes[baseNode->type].count; i++) remNode->paths[baseNode->type][i].type = PATH_DIS;
|
||||
}
|
||||
struct ncclTopoLinkList* remPath;
|
||||
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
|
||||
@@ -110,11 +111,12 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
}
|
||||
|
||||
static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
|
||||
char line[1024];
|
||||
const int linesize = 1024;
|
||||
char line[linesize];
|
||||
#ifdef ENABLE_TRACE
|
||||
INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
|
||||
#else
|
||||
sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
|
||||
snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
|
||||
int offset = strlen(line);
|
||||
#endif
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
|
||||
@@ -126,12 +128,12 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
|
||||
for (int i=0; i<node->paths[t][n].count; i++) {
|
||||
struct ncclTopoLink* link = node->paths[t][n].list[i];
|
||||
struct ncclTopoNode* remNode = link->remNode;
|
||||
sprintf(line+offset, "--%s(%g)->%s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], remNode->id);
|
||||
snprintf(line+offset, linesize-offset, "--%s(%g)->%s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], NCCL_TOPO_ID_SYSTEM_ID(remNode->id), NCCL_TOPO_ID_LOCAL_ID(remNode->id));
|
||||
offset = strlen(line);
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw);
|
||||
#else
|
||||
sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]);
|
||||
snprintf(line+offset, linesize-offset, "%s/%lx-%lx (%d/%.1f/%s) ", topoNodeTypeStr[t], NCCL_TOPO_ID_SYSTEM_ID(system->nodes[t].nodes[n].id), NCCL_TOPO_ID_LOCAL_ID(system->nodes[t].nodes[n].id), node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]);
|
||||
offset = strlen(line);
|
||||
#endif
|
||||
}
|
||||
@@ -361,12 +363,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
|
||||
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
||||
int ncclTopoUserGdrLevel = -1;
|
||||
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int64_t netId, int read, int* useGdr) {
|
||||
*useGdr = 0;
|
||||
|
||||
// Get GPU and NET
|
||||
int n, g;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
@@ -403,18 +405,18 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
|
||||
if (distance == PATH_PXN) {
|
||||
// In case of PXN, use the intermediate GPU distance instead
|
||||
int proxyRank, g;
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
|
||||
struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
|
||||
distance = proxyGpu->paths[NET][n].type;
|
||||
}
|
||||
if (distance > netGdrLevel) {
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %lx (distance %d > %d)", busId, netId, distance, netGdrLevel);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
*useGdr = 1;
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %lx (distance %d <= %d), read %d", busId, netId, distance, netGdrLevel, read);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -465,10 +467,10 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank) {
|
||||
// Get GPU and NET
|
||||
int n, g;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
struct ncclTopoLinkList* path = gpu->paths[NET]+n;
|
||||
@@ -480,7 +482,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
|
||||
type = node->type;
|
||||
}
|
||||
if (type != GPU) {
|
||||
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
|
||||
WARN("Could not find intermediate GPU between GPU rank %d and NIC %lx", rank, netId);
|
||||
return ncclInternalError;
|
||||
}
|
||||
*intermediateRank = node->gpu.rank;
|
||||
@@ -516,11 +518,12 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
|
||||
int nr = 0;
|
||||
int* ranks = NULL;
|
||||
for (int rank=0; rank<comm->nRanks; rank++) {
|
||||
int netDev, proxyRank;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
|
||||
int64_t netId;
|
||||
int proxyRank;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank));
|
||||
if (proxyRank == comm->rank) continue;
|
||||
int useGdr;
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netId, 1, &useGdr));
|
||||
if (useGdr == 0) continue;
|
||||
int found = 0;
|
||||
for (int r=0; r<nr; r++) {
|
||||
@@ -603,13 +606,14 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
if (ncclPxnDisable(comm) != 1) {
|
||||
int localGpuIndex;
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(system, netNode->id, &localGpuIndex));
|
||||
if (localGpuIndex != g && localGpuIndex != -1) {
|
||||
// PXN = PCI + NVLink.
|
||||
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
|
||||
// Only use PXN for NIC n if remote GPU p ...
|
||||
if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
|
||||
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
|
||||
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us
|
||||
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
|
||||
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
|
||||
// We can use that GPU as relay to communicate with that NIC.
|
||||
@@ -618,15 +622,17 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
|
||||
}
|
||||
}
|
||||
// Update path when we dont want to / can't use GPU Direct RDMA.
|
||||
int gdr;
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
|
||||
if (gdr == 0) {
|
||||
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
|
||||
int localCpu;
|
||||
NCCLCHECK(getLocalCpu(system, g, &localCpu));
|
||||
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
|
||||
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
|
||||
if (gpu->paths[NET][n].type < PATH_PHB) {
|
||||
// Update path when we dont want to / can't use GPU Direct RDMA.
|
||||
int gdr;
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
|
||||
if (gdr == 0) {
|
||||
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
|
||||
int localCpu;
|
||||
NCCLCHECK(getLocalCpu(system, g, &localCpu));
|
||||
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
|
||||
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -669,8 +675,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
|
||||
}
|
||||
|
||||
// MNNVL: Remove network nodes as they are connected via NVLink
|
||||
if (system->nodes[GPU].count == comm->nRanks || comm->MNNVL) {
|
||||
if (system->nodes[GPU].count == comm->nRanks) {
|
||||
for (int n=system->nodes[NET].count-1; n>=0; n--)
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
}
|
||||
@@ -704,11 +709,6 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
|
||||
} else {
|
||||
*nChannels = 2;
|
||||
}
|
||||
} else if (comm->MNNVL) {
|
||||
// MNNVL assume all GPUs are connected via NVLink
|
||||
path = system->nodes[GPU].nodes[g].paths[GPU]+((g+1)%system->nodes[GPU].count);
|
||||
float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
|
||||
*nChannels = 2*std::max(1, (int)(path->bw / nvlBw));
|
||||
} else {
|
||||
// Remote rank, use network
|
||||
int nNetChannels = ncclParamNChannelsPerNetPeer();
|
||||
|
||||
+63
-48
@@ -4,6 +4,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "core.h"
|
||||
#include "graph.h"
|
||||
#include "topo.h"
|
||||
@@ -39,6 +40,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
|
||||
int inter = system->nodes[NET].count;
|
||||
if (inter == 0 && system->nodes[GPU].count == 1) {
|
||||
system->maxBw = LOC_BW;
|
||||
system->totalBw = LOC_BW;
|
||||
return ncclSuccess;
|
||||
}
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
@@ -115,7 +117,6 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
|
||||
WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (path->count == 0 ) return ncclSuccess;
|
||||
|
||||
// Now check link type
|
||||
*node = NULL;
|
||||
@@ -217,7 +218,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
|
||||
}
|
||||
|
||||
static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) {
|
||||
int netId = graph->inter[graph->nChannels*2];
|
||||
int64_t netId = graph->inter[graph->nChannels*2];
|
||||
int n;
|
||||
NCCLCHECK(getNetIndex(system, netId, &n));
|
||||
*netPaths=system->nodes[NET].nodes[n].paths[GPU];
|
||||
@@ -261,6 +262,8 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
|
||||
for (int i=0; i<count; i++) next[i] = scores[i].g;
|
||||
}
|
||||
|
||||
*countPtr = count;
|
||||
|
||||
if (system->nodes[NVS].count) {
|
||||
// NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first.
|
||||
int index = gpu-system->nodes[GPU].nodes;
|
||||
@@ -277,16 +280,18 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
|
||||
} else {
|
||||
firstGpus[0] = nextGpu; firstGpuCount = 1;
|
||||
}
|
||||
if (nextGpu == prevGpu && firstGpuCount == 2) firstGpuCount = 1;
|
||||
int firstGpuRealCount = 0;
|
||||
for (int g=0; g<firstGpuCount; g++) {
|
||||
for (i=0; i<count && next[i] != firstGpus[g]; i++);
|
||||
if (i<count) {
|
||||
for (; i>0; i--) next[i] = next[i-1];
|
||||
next[0] = firstGpus[g];
|
||||
firstGpuRealCount++;
|
||||
}
|
||||
}
|
||||
*countPtr = firstGpuRealCount;
|
||||
}
|
||||
|
||||
*countPtr = count;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -372,7 +377,6 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
return ncclSuccess;
|
||||
}
|
||||
// 2. Try to get better bandwidth
|
||||
// Give a 5% perf bonus to paths not crossing nics
|
||||
if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
|
||||
*copy = 1;
|
||||
return ncclSuccess;
|
||||
@@ -405,8 +409,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
||||
localNetCount = 0;
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
for (int c = 0; c<MAXCHANNELS; c++) {
|
||||
int netId;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
|
||||
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
|
||||
localNetCount++;
|
||||
@@ -427,7 +431,7 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
||||
localNetCount = 0;
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
struct ncclTopoLinkList* paths = gpu->paths[NET];
|
||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||
for (int n=0; n<system->nodes[NET].count && n<MAXCHANNELS; n++) {
|
||||
if (paths[n].type == t) localNets[localNetCount++] = n;
|
||||
}
|
||||
// Append NICs to list
|
||||
@@ -702,22 +706,25 @@ struct kvDict kvDictLinkType[] = {
|
||||
|
||||
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int* inter = graph->inter+2*c;
|
||||
int64_t* inter = graph->inter+2*c;
|
||||
int* intra = graph->intra+ngpus*c;
|
||||
int n=0, g=0;
|
||||
for (int s=0; s<xmlChannel->nSubs; s++) {
|
||||
struct ncclXmlNode* sub = xmlChannel->subs[s];
|
||||
int dev;
|
||||
NCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
|
||||
int64_t dev;
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttrStr(sub, "dev", &str));
|
||||
dev = strtol(str, NULL, 16);
|
||||
if (strcmp(sub->name, "net") == 0) {
|
||||
inter[n++] = dev;
|
||||
} else if (strcmp(sub->name, "gpu") == 0) {
|
||||
int rank = -1;
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
|
||||
int systemId = NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[g].id);
|
||||
if (NCCL_TOPO_ID(systemId, system->nodes[GPU].nodes[g].gpu.dev) == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
|
||||
}
|
||||
if (rank == -1) {
|
||||
WARN("XML Import Channel : dev %d not found.", dev);
|
||||
WARN("XML Import Channel : dev %ld not found.", dev);
|
||||
return ncclSystemError;
|
||||
}
|
||||
intra[g++] = rank;
|
||||
@@ -763,29 +770,33 @@ ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclT
|
||||
ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
|
||||
struct ncclXmlNode* xmlChannel;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int* inter = graph->inter+2*c;
|
||||
int64_t* inter = graph->inter+2*c;
|
||||
int* intra = graph->intra+ngpus*c;
|
||||
NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
|
||||
struct ncclXmlNode* node;
|
||||
if (system->nodes[NET].count) {
|
||||
NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
|
||||
NCCLCHECK(xmlSetAttrInt(node, "dev", inter[0]));
|
||||
NCCLCHECK(xmlSetAttrLong(node, "dev", inter[0]));
|
||||
}
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
|
||||
int dev = -1;
|
||||
int64_t dev = -1;
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
|
||||
if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) {
|
||||
int systemId = NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[i].id);
|
||||
dev = NCCL_TOPO_ID(systemId, system->nodes[GPU].nodes[i].gpu.dev);
|
||||
}
|
||||
}
|
||||
if (dev == -1) {
|
||||
WARN("XML Export Channel : rank %d not found.", intra[g]);
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(xmlSetAttrInt(node, "dev", dev));
|
||||
NCCLCHECK(xmlSetAttrLong(node, "dev", dev));
|
||||
if (graph->id == 3) break; // NVLS graphs only use the first GPU
|
||||
}
|
||||
if (system->nodes[NET].count) {
|
||||
NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
|
||||
NCCLCHECK(xmlSetAttrInt(node, "dev", inter[1]));
|
||||
NCCLCHECK(xmlSetAttrLong(node, "dev", inter[1]));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -829,7 +840,7 @@ ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngp
|
||||
|
||||
int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
|
||||
memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
|
||||
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
|
||||
memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int64_t));
|
||||
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
|
||||
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
|
||||
graph->nChannels = dupChannels;
|
||||
@@ -841,7 +852,7 @@ float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0
|
||||
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
|
||||
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
|
||||
|
||||
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
|
||||
float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 11.0, 6.0, 3.0 };
|
||||
float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
|
||||
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
|
||||
@@ -868,7 +879,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
if (str) {
|
||||
INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
|
||||
struct ncclXml* xml;
|
||||
NCCLCHECK(ncclCalloc(&xml, 1));
|
||||
NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
|
||||
NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
|
||||
int nChannels;
|
||||
NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
|
||||
@@ -907,7 +918,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
int speedIndex = 0;
|
||||
float maxBw = system->maxBw;
|
||||
float totalBw = system->totalBw;
|
||||
if (ngpus == 1 || graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
|
||||
if (ngpus > 1 && graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1);
|
||||
while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++;
|
||||
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
|
||||
int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
|
||||
@@ -926,7 +937,7 @@ search:
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
printf("%d ", graph->intra[c*ngpus+g]);
|
||||
}
|
||||
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
|
||||
printf("[%lx %lx]", graph->inter[c*2+0], graph->inter[c*2+1]);
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
@@ -1041,7 +1052,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
sprintf(line, "%2d :", c);
|
||||
int offset = strlen(line);
|
||||
if (system->nodes[NET].count > 0) {
|
||||
sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]);
|
||||
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]);
|
||||
offset = strlen(line);
|
||||
}
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
@@ -1049,7 +1060,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
offset = strlen(line);
|
||||
}
|
||||
if (system->nodes[NET].count > 0) {
|
||||
sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]);
|
||||
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c+1]);
|
||||
offset = strlen(line);
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
@@ -1062,7 +1073,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
|
||||
if (str) {
|
||||
INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
|
||||
struct ncclXml* xml;
|
||||
NCCLCHECK(ncclCalloc(&xml, 1));
|
||||
NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
|
||||
NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
|
||||
NCCLCHECK(ncclTopoDumpXmlToFile(str, xml));
|
||||
free(xml);
|
||||
@@ -1072,11 +1083,11 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
|
||||
|
||||
#include "comm.h"
|
||||
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
|
||||
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int* dev) {
|
||||
ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int64_t* netId) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int localRanks = comm->topo->nodes[GPU].count;
|
||||
int netNum = 0;
|
||||
int net[MAXCHANNELS];
|
||||
int64_t net[MAXCHANNELS];
|
||||
|
||||
for (int c = 0; c < graph->nChannels; c++) {
|
||||
if (graph->intra[c * localRanks] == comm->rank) {
|
||||
@@ -1084,7 +1095,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
|
||||
}
|
||||
}
|
||||
if (netNum) {
|
||||
*dev = net[channelId % netNum];
|
||||
*netId = net[channelId % netNum];
|
||||
} else {
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
@@ -1100,23 +1111,30 @@ fail:
|
||||
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
|
||||
NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
|
||||
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank) {
|
||||
int64_t netId = -1;
|
||||
int netDev = -1;
|
||||
if (graph) {
|
||||
// Honor the net device in the graph
|
||||
int channel = channelId%graph->nChannels;
|
||||
int ngpus = comm->topo->nodes[GPU].count;
|
||||
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
|
||||
if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
|
||||
*dev = graph->inter[channel*2+index];
|
||||
netId = graph->inter[channel*2+index];
|
||||
} else {
|
||||
NCCLCHECK(getNvlsNetDev(comm, graph, channelId, dev));
|
||||
NCCLCHECK(getNvlsNetDev(comm, graph, channelId, &netId));
|
||||
}
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
|
||||
NCCLCHECK(ncclTopoIdToNetDev(comm->topo, netId, &netDev));
|
||||
if (dev) *dev = netDev;
|
||||
if (id) *id = netId;
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, netId, proxyRank));
|
||||
} else if (peerRank == -1) {
|
||||
return ncclInternalError;
|
||||
} else {
|
||||
// Start with our local NIC and local Rank
|
||||
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, dev));
|
||||
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, &netId, &netDev));
|
||||
if (dev) *dev = netDev;
|
||||
if (id) *id = netId;
|
||||
*proxyRank = rank;
|
||||
|
||||
int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
|
||||
@@ -1126,38 +1144,35 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
|
||||
int nvmlDev = comm->peerInfo[peerRank].nvmlDev;
|
||||
int localRank;
|
||||
if (ncclTopoDevToRank(comm->topo, nvmlDev, &localRank) != ncclSuccess) return ncclSuccess;
|
||||
int netDev;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
|
||||
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netId, &netDev));
|
||||
|
||||
int n;
|
||||
// Check that device exists on our node
|
||||
if (ncclParamCrossNic() == 0) {
|
||||
if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) {
|
||||
WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
*dev = netDev;
|
||||
if (dev) *dev = netDev;
|
||||
if (id) *id = netId;
|
||||
}
|
||||
if (pxnLevel == 1) {
|
||||
int g, n;
|
||||
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g));
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n));
|
||||
struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g;
|
||||
if (gpu->paths[NET][n].type <= PATH_PXN) {
|
||||
*dev = netDev;
|
||||
if (dev) *dev = netDev;
|
||||
if (id) *id = netId;
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
|
||||
}
|
||||
} else if (pxnLevel == 2) {
|
||||
// Check which local GPU corresponds to that NIC and see if we can use PXN.
|
||||
int n, g1, g2;
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n));
|
||||
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netDev, &g2));
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2));
|
||||
if (g2 != -1) {
|
||||
struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
|
||||
if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
|
||||
*proxyRank = peerGpu->gpu.rank;
|
||||
*dev = netDev;
|
||||
if (dev) *dev = netDev;
|
||||
if (id) *id = netId;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
+99
-41
@@ -15,13 +15,14 @@
|
||||
#include <fcntl.h>
|
||||
#include "xml.h"
|
||||
#include "cpuset.h"
|
||||
#include "bootstrap.h"
|
||||
|
||||
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
||||
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
||||
|
||||
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
|
||||
|
||||
/******************************************************************/
|
||||
/******************* Graph Creation Functions *********************/
|
||||
@@ -156,9 +157,13 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int ind
|
||||
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) {
|
||||
// Aggregate links into higher bw for NVLink
|
||||
struct ncclTopoLink* link;
|
||||
for (link = node->links; link->remNode; link++) {
|
||||
for (link = node->links; link - node->links != NCCL_TOPO_MAX_LINKS && link->remNode; link++) {
|
||||
if (link->remNode == remNode && link->type == type) break;
|
||||
}
|
||||
if (link - node->links == NCCL_TOPO_MAX_LINKS) {
|
||||
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (link->remNode == NULL) node->nlinks++;
|
||||
link->type = type;
|
||||
link->remNode = remNode;
|
||||
@@ -218,6 +223,10 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
struct ncclTopoNode* remNode = sub->links[l].remNode;
|
||||
if (remNode == pciSwitch) continue;
|
||||
// Add link from parent PCI switch -> PCI device
|
||||
if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
|
||||
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
|
||||
return ncclInternalError;
|
||||
}
|
||||
memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
|
||||
pciSwitch->nlinks++;
|
||||
// Update link from PCI device -> parent PCI switch
|
||||
@@ -243,11 +252,13 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
|
||||
// And connect all CPU nodes together
|
||||
for (int n=0; n<system->nodes[CPU].count; n++) {
|
||||
struct ncclTopoNode* cpu1 = system->nodes[CPU].nodes+n;
|
||||
for (int p=0; p<system->nodes[CPU].count; p++) {
|
||||
if (n == p) continue;
|
||||
struct ncclTopoNode* cpu2 = system->nodes[CPU].nodes+p;
|
||||
if (n == p || (NCCL_TOPO_ID_SYSTEM_ID(cpu1->id) != NCCL_TOPO_ID_SYSTEM_ID(cpu2->id))) continue;
|
||||
float bw;
|
||||
NCCLCHECK(ncclTopoGetInterCpuBw(system->nodes[CPU].nodes+n, &bw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, bw));
|
||||
NCCLCHECK(ncclTopoGetInterCpuBw(cpu1, &bw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(cpu1, cpu2, LINK_SYS, bw));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -255,13 +266,13 @@ ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
|
||||
|
||||
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
|
||||
if (node->type == GPU) {
|
||||
sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
|
||||
sprintf(line+offset, "%s/%lx-%lx (%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->gpu.rank);
|
||||
} else if (node->type == CPU) {
|
||||
sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
|
||||
sprintf(line+offset, "%s/%lx-%lx (%d/%d/%d)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->cpu.arch, node->cpu.vendor, node->cpu.model);
|
||||
} else if (node->type == PCI) {
|
||||
sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
|
||||
sprintf(line+offset, "%s/%lx-%lx (%lx)", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id), node->pci.device);
|
||||
} else {
|
||||
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
|
||||
sprintf(line+offset, "%s/%lx-%lx", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
for (int i=0; i<offset; i++) line[i] = ' ';
|
||||
@@ -328,12 +339,13 @@ ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
|
||||
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) {
|
||||
int dev;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
|
||||
|
||||
struct ncclTopoNode* net;
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev)));
|
||||
net->net.dev = dev;
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
|
||||
if (str) sscanf(str, "0x%lx", &net->net.asic);
|
||||
@@ -356,14 +368,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
|
||||
ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int systemId) {
|
||||
for (int s=0; s<xmlNic->nSubs; s++) {
|
||||
struct ncclXmlNode* xmlNet = xmlNic->subs[s];
|
||||
if (strcmp(xmlNet->name, "net") != 0) continue;
|
||||
int index;
|
||||
NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
|
||||
if (index == -1) continue;
|
||||
NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
|
||||
NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, systemId));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -382,7 +394,7 @@ struct kvDict kvDictPciGen[] = {
|
||||
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
|
||||
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
|
||||
{ NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
|
||||
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
|
||||
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) {
|
||||
const char* str;
|
||||
|
||||
int type;
|
||||
@@ -401,7 +413,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
int index;
|
||||
NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
|
||||
if (index == -1) return ncclSuccess;
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &node, type, NCCL_TOPO_ID(systemId, busId)));
|
||||
NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
|
||||
}
|
||||
struct ncclXmlNode* xmlNic = NULL;
|
||||
@@ -411,14 +423,15 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
// Ignore sub device ID and merge multi-port NICs into one PCI device.
|
||||
busId &= 0xfffffffffffffff0;
|
||||
struct ncclTopoNode* nicNode = NULL;
|
||||
NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
|
||||
int64_t id = NCCL_TOPO_ID(systemId, busId);
|
||||
NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id));
|
||||
if (nicNode == NULL) {
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id));
|
||||
node = nicNode; // Connect it to parent later on
|
||||
}
|
||||
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
|
||||
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, systemId));
|
||||
} else if (type == PCI) {
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &node, type, NCCL_TOPO_ID(systemId, busId)));
|
||||
NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
|
||||
if (str) node->pci.device += strtol(str, NULL, 0) << 48;
|
||||
NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
|
||||
@@ -430,7 +443,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
|
||||
for (int s=0; s<xmlPci->nSubs; s++) {
|
||||
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
|
||||
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
|
||||
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -452,11 +465,25 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
|
||||
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
|
||||
|
||||
// Translates the "host_hash" attribute of a <cpu> XML node into a small,
// dense system index.
//
// system      : topology being built; system->hostHashes[0..nHosts) holds the
//               hashes seen so far and is extended when a new hash shows up.
// xmlCpu      : XML node to read "host_hash" from (parsed as hexadecimal).
//               A missing attribute is treated as hash 0.
// systemIdPtr : receives the index of the hash inside system->hostHashes.
//
// Returns ncclSuccess, the error reported by xmlGetAttr, or ncclInternalError
// when a new hash would not fit in system->hostHashes.
ncclResult_t ncclGetSystemId(struct ncclTopoSystem* system, struct ncclXmlNode* xmlCpu, int* systemIdPtr) {
  const char* hostHashStr;
  NCCLCHECK(xmlGetAttr(xmlCpu, "host_hash", &hostHashStr));
  uint64_t hostHash = hostHashStr ? strtoull(hostHashStr, NULL, 16) : 0;

  // Linear search for an already registered host.
  int systemId = 0;
  while (systemId < system->nHosts && system->hostHashes[systemId] != hostHash) systemId++;

  if (systemId == system->nHosts) {
    // First time we see this host: register it, but never write past the
    // end of hostHashes[], which has NCCL_TOPO_MAX_NODES entries.
    if (system->nHosts >= NCCL_TOPO_MAX_NODES) {
      WARN("Error : too many hosts in topology (max %d)", NCCL_TOPO_MAX_NODES);
      return ncclInternalError;
    }
    system->hostHashes[system->nHosts++] = hostHash;
  }

  *systemIdPtr = systemId;
  return ncclSuccess;
}
|
||||
|
||||
|
||||
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
|
||||
int numaId;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
|
||||
int systemId;
|
||||
NCCLCHECK(ncclGetSystemId(system, xmlCpu, &systemId));
|
||||
struct ncclTopoNode* cpu;
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, NCCL_TOPO_ID(systemId, numaId)));
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
|
||||
if (str != NULL) {
|
||||
@@ -482,26 +509,27 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
|
||||
}
|
||||
for (int s=0; s<xmlCpu->nSubs; s++) {
|
||||
struct ncclXmlNode* node = xmlCpu->subs[s];
|
||||
if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
|
||||
if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId));
|
||||
if (strcmp(node->name, "nic") == 0) {
|
||||
struct ncclTopoNode* nic = NULL;
|
||||
NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
|
||||
if (nic == NULL) {
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, NCCL_TOPO_ID(systemId, 0)));
|
||||
NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW));
|
||||
NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW));
|
||||
}
|
||||
NCCLCHECK(ncclTopoAddNic(node, system, nic));
|
||||
NCCLCHECK(ncclTopoAddNic(node, system, nic, systemId));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
|
||||
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
|
||||
if (strcmp(node->name, "nvlink") == 0) {
|
||||
struct ncclTopoNode* gpu = NULL;
|
||||
int64_t pBusId;
|
||||
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
|
||||
pBusId = NCCL_TOPO_ID(systemId, pBusId);
|
||||
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
|
||||
if (gpu == NULL) {
|
||||
WARN("Add NVLink error : could not find GPU %lx", pBusId);
|
||||
@@ -520,7 +548,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
NCCLCHECK(xmlGetAttrStr(node, "target", &target));
|
||||
int64_t busId;
|
||||
NCCLCHECK(busIdToInt64(target, &busId));
|
||||
NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
|
||||
NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId)));
|
||||
} else if (targetType == CPU) {
|
||||
// NVL connection to the local CPU
|
||||
NCCLCHECK(findLocalCpu(gpu, &remote));
|
||||
@@ -539,20 +567,24 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (strcmp(node->name, "cpu") == 0) {
|
||||
NCCLCHECK(ncclGetSystemId(system, node, &systemId));
|
||||
}
|
||||
const char* busId;
|
||||
NCCLCHECK(xmlGetAttr(node, "busid", &busId));
|
||||
for (int s=0; s<node->nSubs; s++) {
|
||||
NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
|
||||
NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId, systemId));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
|
||||
ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
|
||||
if (strcmp(node->name, "c2c") == 0) {
|
||||
struct ncclTopoNode* gpu = NULL;
|
||||
int64_t pBusId;
|
||||
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
|
||||
pBusId = NCCL_TOPO_ID(systemId, pBusId);
|
||||
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
|
||||
if (gpu == NULL) {
|
||||
WARN("Add NVLink error : could not find GPU %lx", pBusId);
|
||||
@@ -569,25 +601,31 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
|
||||
} else {
|
||||
if (strcmp(node->name, "cpu") == 0) {
|
||||
NCCLCHECK(ncclGetSystemId(system, node, &systemId));
|
||||
}
|
||||
const char* busId;
|
||||
NCCLCHECK(xmlGetAttr(node, "busid", &busId));
|
||||
for (int s=0; s<node->nSubs; s++) {
|
||||
NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? busId : parentBusId));
|
||||
NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? busId : parentBusId, systemId));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
|
||||
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {
|
||||
NCCLCHECK(ncclCalloc(topoSystem, 1));
|
||||
struct ncclTopoSystem* system = *topoSystem;
|
||||
struct ncclXmlNode* topNode;
|
||||
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
|
||||
for (int s=0; s<topNode->nSubs; s++) {
|
||||
struct ncclXmlNode* node = topNode->subs[s];
|
||||
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
|
||||
}
|
||||
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
|
||||
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL));
|
||||
for (int systemId=0; systemId<system->nHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId;
|
||||
|
||||
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
|
||||
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
|
||||
|
||||
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
|
||||
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
|
||||
@@ -633,7 +671,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
|
||||
|
||||
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
|
||||
struct ncclXml* xml;
|
||||
NCCLCHECK(ncclCalloc(&xml, 1));
|
||||
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
|
||||
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
|
||||
if (xmlTopoFile) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
|
||||
@@ -707,13 +745,32 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
|
||||
NCCLCHECK(ncclTopoTrimXml(xml));
|
||||
|
||||
if (comm->MNNVL) {
|
||||
// MNNVL clique support
|
||||
char* mem;
|
||||
NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
|
||||
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
|
||||
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
|
||||
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
struct ncclXml* cliqueXml;
|
||||
NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
|
||||
for (int i = 0; i < comm->clique.size; i++) {
|
||||
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
|
||||
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
|
||||
NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
|
||||
}
|
||||
free(xml);
|
||||
xml = cliqueXml;
|
||||
}
|
||||
|
||||
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
|
||||
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
|
||||
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
|
||||
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
|
||||
free(xml);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -761,7 +818,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
|
||||
int gpu;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
||||
int* localNets;
|
||||
@@ -773,15 +830,16 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
|
||||
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
|
||||
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
|
||||
net += channelId%(DIVUP(localNetCount,localGpuCount));
|
||||
*id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
|
||||
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
|
||||
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
|
||||
free(localNets);
|
||||
free(localGpus);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
|
||||
int netIndex;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex));
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
|
||||
int* localGpus = NULL;
|
||||
int localGpuCount;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
|
||||
@@ -789,9 +847,9 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gp
|
||||
for (int lg=0; lg<localGpuCount; lg++) {
|
||||
int g = localGpus[lg];
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
int id;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
|
||||
if (net == id) {
|
||||
int64_t id;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
|
||||
if (netId == id) {
|
||||
*gpuIndex = g;
|
||||
free(localGpus);
|
||||
return ncclSuccess;
|
||||
|
||||
+26
-3
@@ -88,7 +88,7 @@ struct ncclTopoLink {
|
||||
float bw;
|
||||
struct ncclTopoNode* remNode;
|
||||
};
|
||||
#define NCCL_TOPO_MAX_LINKS 32
|
||||
#define NCCL_TOPO_MAX_LINKS 128
|
||||
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
|
||||
|
||||
struct ncclTopoLinkList {
|
||||
@@ -103,6 +103,10 @@ struct ncclTopoLinkList {
|
||||
|
||||
#define NCCL_TOPO_UNDEF (-1)
|
||||
|
||||
// Topology node ids are 64-bit values: the top 8 bits carry the system (host)
// index and the low 56 bits carry the host-local id (bus id, NUMA id, net
// device number, ...). Every macro argument is parenthesized so that
// expression arguments (e.g. `a | b`) are evaluated as a whole.
#define NCCL_TOPO_ID_SYSTEM_ID(id) ((id) >> 56)
#define NCCL_TOPO_ID_LOCAL_ID(id) ((id) & 0x00ffffffffffffff)
#define NCCL_TOPO_ID(systemid, localid) (((int64_t)(systemid) << 56) + (localid))
|
||||
|
||||
struct ncclTopoNode {
|
||||
int type;
|
||||
int64_t id;
|
||||
@@ -115,6 +119,7 @@ struct ncclTopoNode {
|
||||
int gdrSupport;
|
||||
}gpu;
|
||||
struct {
|
||||
int dev; // Plugin dev number
|
||||
uint64_t asic;
|
||||
int port;
|
||||
float bw;
|
||||
@@ -147,6 +152,9 @@ struct ncclTopoNodeSet {
|
||||
};
|
||||
|
||||
struct ncclTopoSystem {
|
||||
int systemId;
|
||||
uint64_t hostHashes[NCCL_TOPO_MAX_NODES];
|
||||
int nHosts;
|
||||
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
|
||||
float maxBw;
|
||||
float totalBw;
|
||||
@@ -158,9 +166,11 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id)
|
||||
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw);
|
||||
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank);
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank);
|
||||
|
||||
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
|
||||
#define NCCL_TOPO_XML_MAX_NODES 256
|
||||
#define NCCL_GRAPH_XML_MAX_NODES 4096
|
||||
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, uint64_t localHostHash);
|
||||
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
|
||||
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
|
||||
|
||||
@@ -191,6 +201,7 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
|
||||
static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) {
|
||||
*rank = -1;
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
if (NCCL_TOPO_ID_SYSTEM_ID(system->nodes[GPU].nodes[i].id) != system->systemId) continue; // Only consider GPUs on our node
|
||||
if (system->nodes[GPU].nodes[i].gpu.dev == dev) {
|
||||
*rank = system->nodes[GPU].nodes[i].gpu.rank;
|
||||
return ncclSuccess;
|
||||
@@ -199,6 +210,18 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Finds the NET node whose topology id equals `id` and stores its plugin
// device number (net.dev) in *netDev.
//
// *netDev is set to -1 up front, so it stays -1 when no NET node matches; in
// that case a warning is logged and ncclInternalError is returned.
static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id, int* netDev) {
  *netDev = -1;
  for (int i=0; i<system->nodes[NET].count; i++) {
    struct ncclTopoNode* net = system->nodes[NET].nodes+i;
    if (net->id == id) {
      *netDev = net->net.dev;
      return ncclSuccess;
    }
  }
  // No trailing newline, consistent with the other WARN() call sites.
  WARN("Could not find NET with id %lx", id);
  return ncclInternalError;
}
|
||||
|
||||
// Returns NVLink bw in GB/s
|
||||
static float ncclTopoNVLinkBw(int cudaCompCap) {
|
||||
return
|
||||
|
||||
+8
-11
@@ -66,7 +66,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
||||
{ /* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
|
||||
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
|
||||
/* NVLS */ { 0, 0, 23 }, /* NVLSTree */ { 0, 0, 23 } },
|
||||
/* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
|
||||
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
|
||||
@@ -86,7 +86,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
||||
static const double llMaxBws[3][3] = {
|
||||
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
|
||||
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
|
||||
/* Hopper-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}
|
||||
/* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}
|
||||
};
|
||||
|
||||
static const double perChMaxRingLL128Bws[3][3] = {
|
||||
@@ -132,8 +132,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
|
||||
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
||||
|
||||
// MNNVL support - treat as a single NVLink connected node
|
||||
int nNodes = comm->MNNVL ? 1 : comm->nNodes;
|
||||
int nNodes = comm->nNodes;
|
||||
int nRanks = comm->nRanks;
|
||||
if (nRanks <= 1) return ncclSuccess;
|
||||
|
||||
@@ -178,7 +177,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
float busBw = graphs[a]->nChannels * bw;
|
||||
|
||||
// Various model refinements
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
||||
@@ -190,7 +189,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
|
||||
busBw = ppn * bw;
|
||||
// AllGather/ReduceScatter requires 1:1 GPU:NIC
|
||||
int nicPerNode = comm->collNetHeadsUniqueNum;
|
||||
int nicPerNode = comm->collNetHeadsNum;
|
||||
if (coll == ncclFuncAllGather && comm->nNodes > 1) {
|
||||
if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0;
|
||||
}
|
||||
@@ -282,15 +281,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
||||
}
|
||||
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes == 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
|
||||
if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
|
||||
|
||||
// Disable CollNet if it is not supported
|
||||
if (comm->collNetSupport == 0) {
|
||||
algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
|
||||
algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
|
||||
// MNNVL: NVLS not yet supported
|
||||
if (comm->nNodes > 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||
if (nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
|
||||
// If user has hard set NCCL_ALGO=COLLNET, ignore it
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
|
||||
algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
|
||||
@@ -437,7 +434,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
|
||||
int logSize = log2i(info->nBytes>>6);
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && (!info->comm->MNNVL && info->comm->nNodes > 1)
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
|
||||
&& info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
|
||||
lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
|
||||
}
|
||||
|
||||
+86
-3
@@ -172,8 +172,8 @@ struct xmlHandler {
|
||||
ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
|
||||
if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
|
||||
while (1) {
|
||||
if (xml->maxIndex == MAX_NODES) {
|
||||
WARN("Error : XML parser is limited to 1024 nodes");
|
||||
if (xml->maxIndex == xml->maxNodes) {
|
||||
WARN("Error : XML parser is limited to %d nodes", xml->maxNodes);
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
|
||||
@@ -198,7 +198,13 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
|
||||
int found = 0;
|
||||
for (int h=0; h<nHandlers; h++) {
|
||||
if (strcmp(node->name, handlers[h].name) == 0) {
|
||||
if (head) head->subs[head->nSubs++] = node;
|
||||
if (head) {
|
||||
if (head->nSubs == MAX_SUBS) {
|
||||
WARN("Error : XML parser is limited to %d subnodes", MAX_SUBS);
|
||||
return ncclInternalError;
|
||||
}
|
||||
head->subs[head->nSubs++] = node;
|
||||
}
|
||||
node->parent = head;
|
||||
node->nSubs = 0;
|
||||
xml->maxIndex++;
|
||||
@@ -218,6 +224,23 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
|
||||
/* XML Writer */
|
||||
/**************/
|
||||
|
||||
// exp == 1 -- serialize; exp == 0 -- deserialize
|
||||
ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp) {
|
||||
for (int n = 0; n < xml->maxIndex; n++) {
|
||||
struct ncclXmlNode *node = &xml->nodes[n];
|
||||
|
||||
// For "parent", we shift the base by 1 so that we can distinguish actual
|
||||
// NULL pointers from pointers pointing to the first node.
|
||||
if (node->parent)
|
||||
node->parent = (struct ncclXmlNode *) (exp ? ((uintptr_t)node->parent - base + 1) : (base - 1 + (uintptr_t)node->parent));
|
||||
|
||||
for (int s = 0; s < node->nSubs; s++) {
|
||||
node->subs[s] = (struct ncclXmlNode *) (exp ? ((uintptr_t)node->subs[s] - base) : (base + (uintptr_t)node->subs[s]));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) {
|
||||
for (int i=0; i<indent; i++) fprintf(file, " ");
|
||||
fprintf(file, "<%s", node->name);
|
||||
@@ -249,6 +272,60 @@ ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Merges the topology described by `src` into `dst`.
//
// If `dst` has no "system" tag yet, the whole tree rooted at the first node of
// `src` is added to `dst`. Otherwise every "cpu" node of `src` is added under
// the "system" node of `dst`, unless `dst` already holds a "cpu" node with the
// same "numaid" and "host_hash" attributes (a missing "host_hash" compares as
// "0").
//
// Returns ncclSuccess, ncclInternalError when a "cpu" node has no "numaid",
// or the error reported by any of the XML helpers.
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
  struct ncclXmlNode* topNode;
  NCCLCHECK(xmlFindTag(dst, "system", &topNode));

  if (topNode == NULL) {
    // Propagate failures instead of silently reporting success.
    NCCLCHECK(xmlAddTree(dst, NULL, src->nodes));
    return ncclSuccess;
  }

  // Fuse the CPUs with the first XML
  struct ncclXmlNode* srcCpu;
  NCCLCHECK(xmlFindTag(src, "cpu", &srcCpu));
  while (srcCpu) {
    const char* srcNumaId;
    const char* srcHostHash;
    NCCLCHECK(xmlGetAttr(srcCpu, "numaid", &srcNumaId));
    if (srcNumaId == NULL) {
      WARN("TopoFuseXmls : could not find CPU numa ID.");
      return ncclInternalError;
    }
    // Checked so that srcHostHash is never read uninitialized on failure.
    NCCLCHECK(xmlGetAttr(srcCpu, "host_hash", &srcHostHash));
    if (srcHostHash == NULL) {
      srcHostHash = "0";
    }

    // Search through the destination for a duplicate. Note that
    // this makes the complexity of this whole function O(n^2), but n
    // is expected to be small.
    struct ncclXmlNode* dstCpu;
    NCCLCHECK(xmlFindTag(dst, "cpu", &dstCpu));
    while (dstCpu) {
      const char* dstNumaId;
      const char* dstHostHash;
      NCCLCHECK(xmlGetAttr(dstCpu, "numaid", &dstNumaId));
      if (dstNumaId == NULL) {
        WARN("TopoFuseXmls : could not find CPU numa ID.");
        return ncclInternalError;
      }
      NCCLCHECK(xmlGetAttr(dstCpu, "host_hash", &dstHostHash));
      if (dstHostHash == NULL) {
        dstHostHash = "0";
      }
      if (strcmp(srcNumaId, dstNumaId) == 0 && strcmp(srcHostHash, dstHostHash) == 0) {
        break;
      }

      NCCLCHECK(xmlFindNextTag(dst, "cpu", dstCpu, &dstCpu));
    }
    // Only add the CPU if no duplicate was found
    if (dstCpu == NULL) {
      NCCLCHECK(xmlAddTree(dst, topNode, srcCpu));
    }
    NCCLCHECK(xmlFindNextTag(src, "cpu", srcCpu, &srcCpu));
  }
  return ncclSuccess;
}
|
||||
|
||||
|
||||
/****************************************/
|
||||
/* Parser rules for our specific format */
|
||||
/****************************************/
|
||||
@@ -556,6 +633,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
|
||||
if (parent == NULL) {
|
||||
NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
|
||||
NCCLCHECK(xmlSetAttrLong(parent, "host_hash", getHostHash()));
|
||||
NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
|
||||
}
|
||||
} else if (slashCount == 2) {
|
||||
@@ -581,6 +659,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
struct ncclXmlNode* topNode;
|
||||
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
|
||||
NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
|
||||
NCCLCHECK(xmlSetAttrLong(parent, "host_hash", getHostHash()));
|
||||
NCCLCHECK(xmlSetAttr(parent, "numaid", "-1"));
|
||||
NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml));
|
||||
}
|
||||
@@ -595,6 +674,10 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
NCCLCHECK(xmlGetAttr(parent->subs[s], "busid", &busId));
|
||||
if (busId != NULL && strcmp(newBusId, busId) < 0) { subIndex = s; break; }
|
||||
}
|
||||
if (parent->nSubs == MAX_SUBS) {
|
||||
WARN("Error : XML parser is limited to %d subnodes", MAX_SUBS);
|
||||
return ncclInternalError;
|
||||
}
|
||||
for (int s = parent->nSubs; s > subIndex; s--) parent->subs[s] = parent->subs[s-1];
|
||||
parent->subs[subIndex] = pciNode;
|
||||
parent->nSubs++;
|
||||
|
||||
+84
-7
@@ -10,13 +10,13 @@
|
||||
#include "nccl.h"
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "alloc.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
// A few constraints to make the implementation easy
|
||||
#define MAX_STR_LEN 255
|
||||
#define MAX_ATTR_COUNT 16
|
||||
#define MAX_SUBS 32
|
||||
#define MAX_NODES 1024
|
||||
#define MAX_SUBS 128
|
||||
|
||||
#define NODE_TYPE_NONE 0
|
||||
#define NODE_TYPE_OPEN 1
|
||||
@@ -37,8 +37,8 @@ struct ncclXmlNode {
|
||||
};
|
||||
|
||||
struct ncclXml {
|
||||
struct ncclXmlNode nodes[MAX_NODES];
|
||||
int maxIndex;
|
||||
int maxIndex, maxNodes;
|
||||
struct ncclXmlNode nodes[1];
|
||||
};
|
||||
|
||||
/* File functions */
|
||||
@@ -55,11 +55,27 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
|
||||
/* Remove unneeded parts */
|
||||
ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
|
||||
|
||||
/* Fuse multiple system XMLs into one, skipping duplicate CPUs */
|
||||
ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src);
|
||||
/* Relocate pointers in XML to (de-)serialize the structure */
|
||||
ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp);
|
||||
|
||||
/**************/
|
||||
/* XML Struct */
|
||||
/* Functions */
|
||||
/**************/
|
||||
|
||||
static size_t xmlMemSize(int maxNodes) {
|
||||
return offsetof(struct ncclXml, nodes) + sizeof(struct ncclXmlNode)*maxNodes;
|
||||
}
|
||||
// Allocate an XML structure with room for `maxNodes` nodes and record that
// capacity in its `maxNodes` field. The memory comes from ncclCalloc
// (presumably zero-filled, so maxIndex would start at 0 -- confirm).
// On success *xml points to the new structure.
static ncclResult_t xmlAlloc(struct ncclXml** xml, int maxNodes) {
  char* buffer;
  NCCLCHECK(ncclCalloc(&buffer, xmlMemSize(maxNodes)));
  struct ncclXml* allocated = (struct ncclXml*)buffer;
  allocated->maxNodes = maxNodes;
  *xml = allocated;
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) {
|
||||
*index = -1;
|
||||
const int nAttrs = node->nAttrs;
|
||||
@@ -101,6 +117,13 @@ static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* a
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Read attribute `attrName` of `node` as a 64-bit integer into *value.
// The string is parsed with base auto-detection (decimal, 0x hex, 0 octal).
// Parsing goes through strtoull so that the unsigned hex strings written by
// xmlSetAttrLong ("%#lx") round-trip bit-for-bit even when the top bit is set;
// strtol would clamp those to LONG_MAX. Negative decimal strings still yield
// the expected negative value after the cast.
// Errors from the attribute lookup are propagated.
static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) {
  const char* str;
  NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
  *value = (int64_t)strtoull(str, NULL, 0);
  return ncclSuccess;
}
|
||||
|
||||
|
||||
static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
|
||||
const char* str;
|
||||
@@ -121,6 +144,18 @@ static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Find the next node named `tagName` located after `prev` in xml's node array.
// *node receives the match, or NULL when no later node carries that name.
// Always returns ncclSuccess.
static ncclResult_t xmlFindNextTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode* prev, struct ncclXmlNode** node) {
  *node = NULL;
  // Resume scanning at the slot right after `prev`.
  int idx = prev-xml->nodes+1;
  while (idx < xml->maxIndex) {
    struct ncclXmlNode* candidate = &xml->nodes[idx];
    if (strcmp(candidate->name, tagName) == 0) {
      *node = candidate;
      break;
    }
    idx++;
  }
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) {
|
||||
*node = NULL;
|
||||
for (int i=0; i<xml->maxIndex; i++) {
|
||||
@@ -188,6 +223,19 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Set attribute `attrName` of `node` to `value`, stored as a "0x"-prefixed
// hexadecimal string. An existing attribute with that name is overwritten;
// otherwise a new attribute slot is appended.
static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrName, const int64_t value) {
  int index;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
  if (index == -1) {
    // Attribute not present yet: take the next free slot and store the key.
    index = node->nAttrs++;
    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
    node->attrs[index].key[MAX_STR_LEN] = '\0';
  }
  // Cast explicitly so the argument type matches the conversion specifier
  // regardless of how int64_t is defined on the platform.
  snprintf(node->attrs[index].value, MAX_STR_LEN, "%#llx", (unsigned long long)value);
  node->attrs[index].value[MAX_STR_LEN] = '\0';
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) {
|
||||
int index;
|
||||
NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
|
||||
@@ -234,8 +282,8 @@ static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName
|
||||
}
|
||||
|
||||
static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) {
|
||||
if (xml->maxIndex == MAX_NODES) {
|
||||
WARN("Error : too many XML nodes (max %d)", MAX_NODES);
|
||||
if (xml->maxIndex == xml->maxNodes) {
|
||||
WARN("Error : too many XML nodes (max %d)", xml->maxNodes);
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclXmlNode* s = xml->nodes+xml->maxIndex++;
|
||||
@@ -243,7 +291,13 @@ static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent,
|
||||
s->nAttrs = 0;
|
||||
*sub = s;
|
||||
s->parent = parent;
|
||||
if (parent) parent->subs[parent->nSubs++] = s;
|
||||
if (parent) {
|
||||
if (parent->nSubs == MAX_SUBS) {
|
||||
WARN("Error : too many XML subnodes (max %d)", MAX_SUBS);
|
||||
return ncclInternalError;
|
||||
}
|
||||
parent->subs[parent->nSubs++] = s;
|
||||
}
|
||||
strncpy(s->name, subName, MAX_STR_LEN);
|
||||
s->name[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
@@ -262,6 +316,29 @@ static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Copy the subtree rooted at `srcNode` into `dst`, attaching the copy as the
// last child of `parent` (or as a parentless node when `parent` is NULL).
// Returns ncclInternalError when dst has no free node slot or when `parent`
// already has MAX_SUBS children. Both limits are checked before anything is
// modified, so a failure at this level leaves dst untouched. A failure deeper
// in the recursion still leaves the nodes copied so far in dst.
static ncclResult_t xmlAddTree(struct ncclXml* dst, struct ncclXmlNode* parent, struct ncclXmlNode* srcNode) {
  if (dst->maxIndex == dst->maxNodes) {
    WARN("Error : too many XML nodes (max %d)", dst->maxNodes);
    return ncclInternalError;
  }
  if (parent && parent->nSubs == MAX_SUBS) {
    WARN("Error : too many XML subnodes (max %d)", MAX_SUBS);
    return ncclInternalError;
  }

  struct ncclXmlNode* dstNode = dst->nodes+dst->maxIndex++;
  *dstNode = *srcNode;
  dstNode->parent = parent;
  // The copied child pointers refer to src; drop them and rebuild below.
  dstNode->nSubs = 0;
  if (parent) {
    parent->subs[parent->nSubs++] = dstNode;
  }

  // Recursively copy the subtree(s)
  for (int i=0; i<srcNode->nSubs; i++) {
    NCCLCHECK(xmlAddTree(dst, dstNode, srcNode->subs[i]));
  }
  return ncclSuccess;
}
|
||||
|
||||
|
||||
// Dictionary for STR -> INT conversions. No dictionary size information,
|
||||
// there needs to be a last element with str == NULL.
|
||||
struct kvDict {
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "info.h"
|
||||
|
||||
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
|
||||
ncclResult_t CommCheck(struct ncclComm* ptr, const char* opname, const char* ptrname);
|
||||
ncclResult_t ArgsCheck(struct ncclInfo* info);
|
||||
ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);
|
||||
|
||||
|
||||
@@ -24,7 +24,9 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
|
||||
ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag);
|
||||
ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size);
|
||||
ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
|
||||
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
|
||||
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
|
||||
ncclResult_t bootstrapClose(void* commState);
|
||||
|
||||
+27
-3
@@ -87,6 +87,12 @@ struct ncclNodeRanks {
|
||||
int* localRankToRank;
|
||||
};
|
||||
|
||||
// Description of a clique of ranks; struct ncclComm holds one as its
// "MNNVL clique information".
struct cliqueInfo {
|
||||
// presumably the clique identifier -- confirm against the code that fills it
int id;
|
||||
// presumably the number of entries in `ranks` -- confirm
int size;
|
||||
// presumably the ranks belonging to the clique -- confirm; ownership not visible here
int *ranks;
|
||||
};
|
||||
|
||||
struct ncclDestructor {
|
||||
struct ncclDestructor* next;
|
||||
void* obj;
|
||||
@@ -165,6 +171,14 @@ struct ncclNvlsMcHandleList {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// List element describing one collnet handle; queued in
// ncclKernelPlan::collnetHandleQueue through the `next` member.
struct ncclCollnetHandleList {
|
||||
// Intrusive link used by ncclIntruQueue
struct ncclCollnetHandleList *next;
|
||||
// presumably the handle obtained when the buffer was registered -- confirm
void* collnetHandle;
|
||||
// presumably the size of `buffer` in bytes -- confirm
size_t size;
|
||||
// presumably the user buffer the handle refers to -- confirm
const void* buffer;
|
||||
// presumably the proxy connection needed to deregister the handle -- confirm
struct ncclProxyConnector* proxyconn;
|
||||
};
|
||||
|
||||
struct ncclKernelPlan {
|
||||
// A kernel plan is also a callback that reclaims itself. Hence this must
|
||||
// be the first member.
|
||||
@@ -188,6 +202,7 @@ struct ncclKernelPlan {
|
||||
|
||||
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
|
||||
struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;
|
||||
struct ncclIntruQueue<struct ncclCollnetHandleList, &ncclCollnetHandleList::next> collnetHandleQueue;
|
||||
|
||||
struct Channel {
|
||||
int nWork;
|
||||
@@ -202,7 +217,10 @@ struct ncclKernelPlan {
|
||||
size_t maxBytesPerChannel;
|
||||
};
|
||||
|
||||
#define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28.
|
||||
|
||||
struct ncclComm {
|
||||
uint64_t startMagic;
|
||||
struct ncclMemoryStack memPermanent, memScoped;
|
||||
// List of destructors to run when comm is destructed
|
||||
struct ncclDestructor* destructorHead;
|
||||
@@ -245,7 +263,10 @@ struct ncclComm {
|
||||
int* localRankToRank;
|
||||
// localRanks and localRanktoRank for all nodes
|
||||
struct ncclNodeRanks* nodeRanks;
|
||||
int MNNVL; // MNNVL: Multi-Node NVLink
|
||||
// MNNVL: Multi-Node NVLink
|
||||
int MNNVL; // true when MNNVL is available
|
||||
struct cliqueInfo clique; // Our MNNVL clique information
|
||||
int cliqueRank; // Our rank within the MNNVL clique
|
||||
|
||||
bool checkPointers;
|
||||
bool dmaBufSupport;
|
||||
@@ -257,7 +278,6 @@ struct ncclComm {
|
||||
int nChannels; // connection nChannels
|
||||
int collChannels; // enqueue nChannels
|
||||
int nvlsChannels; // enqueue nChannels
|
||||
int collNetChannels;
|
||||
// Channels (per peer) for p2p
|
||||
int p2pnChannels;
|
||||
int p2pnChannelsPerPeer;
|
||||
@@ -269,6 +289,7 @@ struct ncclComm {
|
||||
// Buffer sizes
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
int p2pChunkSize;
|
||||
int nvlsChunkSize;
|
||||
|
||||
// Algorithm/Protocols thresholds
|
||||
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
@@ -315,11 +336,11 @@ struct ncclComm {
|
||||
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
bool collNetRegSupport;
|
||||
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
|
||||
int intraHighestTransportType;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
int collNetHeadsUniqueNum;
|
||||
int* collNetDenseToUserRank;
|
||||
int* collNetUserToDenseRank;
|
||||
/* sharable collNet proxy progress resource. */
|
||||
@@ -336,6 +357,7 @@ struct ncclComm {
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
struct ncclMemoryPool memPool_ncclPointerList;
|
||||
struct ncclMemoryPool memPool_ncclNvlsHandleList;
|
||||
struct ncclMemoryPool memPool_ncclCollnetHandleList;
|
||||
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
|
||||
// this comm is not yet in a group.
|
||||
struct ncclComm* groupNext;
|
||||
@@ -368,8 +390,10 @@ struct ncclComm {
|
||||
|
||||
// Tuning plugin
|
||||
ncclTuner_t* tuner;
|
||||
void *tunerContext;
|
||||
// buffer registration cache
|
||||
struct ncclRegCache regCache;
|
||||
uint64_t endMagic;
|
||||
};
|
||||
|
||||
enum ncclLaunchMode {
|
||||
|
||||
+30
-40
@@ -20,10 +20,6 @@ extern int ncclCuMemEnable();
|
||||
// Handle type used for cuMemCreate()
|
||||
extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
|
||||
#else
|
||||
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
|
||||
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
|
||||
#endif
|
||||
|
||||
#define CUPFN(symbol) pfn_##symbol
|
||||
@@ -69,53 +65,47 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
|
||||
// cuMem API support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemMap);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
|
||||
#if CUDA_VERSION >= 11070
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
#endif
|
||||
#if CUDA_VERSION >= 12010
|
||||
/* NVSwitch Multicast support */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuInit, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
|
||||
|
||||
|
||||
ncclResult_t ncclCudaLibraryInit(void);
|
||||
|
||||
extern int ncclCudaDriverVersionCache;
|
||||
|
||||
+19
-2
@@ -84,6 +84,15 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
#define NCCL_IPC_READ 0x10
|
||||
#define NCCL_NVLS_MIN_POLL 0x20
|
||||
|
||||
#define NCCL_MAX_COLLNET_SIZE (1L << 29)
|
||||
|
||||
// Kind of buffer registration; stored in ncclInfo::regBufType.
// NOTE(review): per-value meanings below are inferred from the names -- confirm.
enum ncclRegBufferType {
|
||||
// presumably: no registered buffer in use
NCCL_REGULAR_BUFFER = 0,
|
||||
// presumably: buffer registered for IPC
NCCL_IPC_REG_BUFFER = 1,
|
||||
// presumably: buffer registered for NVLS
NCCL_NVLS_REG_BUFFER = 2,
|
||||
// presumably: buffer registered for collnet
NCCL_COLLNET_REG_BUFFER = 3
|
||||
};
|
||||
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
|
||||
@@ -93,6 +102,7 @@ struct ncclConnInfo {
|
||||
|
||||
int flags; // Direct communication / other flags
|
||||
int shared; // Buffers are shared
|
||||
int stepSize; // Step size for the SIMPLE buffer
|
||||
void **ptrExchange; // Pointer exchange for direct communication
|
||||
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
|
||||
|
||||
@@ -157,7 +167,7 @@ struct ncclDirect {
|
||||
int down[NCCL_MAX_DIRECT_ARITY];
|
||||
};
|
||||
|
||||
#define NCCL_MAX_NVLS_ARITY 8
|
||||
#define NCCL_MAX_NVLS_ARITY 32
|
||||
#define NCCL_MAX_NVLS_TREE_ARITY 3
|
||||
struct ncclNvls {
|
||||
int out;
|
||||
@@ -171,6 +181,12 @@ struct ncclNvls {
|
||||
int nNodes;
|
||||
};
|
||||
|
||||
#if __CUDA_ARCH__ >= 900
|
||||
#define NCCL_MAX_ARITY NCCL_MAX_NVLS_ARITY
|
||||
#else
|
||||
#define NCCL_MAX_ARITY NCCL_MAX_DIRECT_ARITY
|
||||
#endif
|
||||
|
||||
#define NCCL_MAX_CONNS 2
|
||||
struct ncclChannelPeer {
|
||||
struct ncclConnector send[NCCL_MAX_CONNS];
|
||||
@@ -212,9 +228,10 @@ struct ncclWorkElem {
|
||||
union {
|
||||
uint8_t flagBits;
|
||||
struct {
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, oneNode:1;
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, oneNode:1;
|
||||
};
|
||||
};
|
||||
uint8_t regUsed;
|
||||
uint8_t nWarps;
|
||||
uint8_t direct;
|
||||
uint32_t root;
|
||||
|
||||
@@ -31,10 +31,10 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
|
||||
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
|
||||
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
|
||||
int ncclPxnDisable(struct ncclComm* comm);
|
||||
@@ -56,8 +56,8 @@ ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vend
|
||||
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev);
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
|
||||
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
|
||||
|
||||
#define NCCL_TOPO_MAX_NODES 256
|
||||
@@ -88,7 +88,7 @@ struct ncclTopoGraph {
|
||||
int sameChannels;
|
||||
int nHops;
|
||||
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
|
||||
int inter[MAXCHANNELS*2];
|
||||
int64_t inter[MAXCHANNELS*2];
|
||||
};
|
||||
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
||||
|
||||
@@ -110,7 +110,7 @@ struct ncclTopoRanks {
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs);
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
|
||||
#include "info.h"
|
||||
|
||||
@@ -31,13 +31,6 @@ typedef enum : uint8_t {
|
||||
ncclPatternRecv
|
||||
} ncclPattern_t;
|
||||
|
||||
enum ncclRegBufferType {
|
||||
NCCL_REGULAR_BUFFER = 0,
|
||||
NCCL_IPC_REG_BUFFER = 1,
|
||||
NCCL_NVLS_REG_BUFFER = 2,
|
||||
NCCL_REG_BUFFER_NUM = 3
|
||||
};
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
struct ncclInfo {
|
||||
ncclFunc_t coll;
|
||||
@@ -70,6 +63,9 @@ struct ncclInfo {
|
||||
ncclRegBufferType regBufType;
|
||||
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
|
||||
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
|
||||
// collnet buffer reg handles
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
// Need to initialize
|
||||
int nThreads;
|
||||
int nChannels;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#define NCCL_DEBUG_H_
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
|
||||
@@ -17,13 +17,17 @@ typedef struct {
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
|
||||
// nNodes: number of nodes in current communicator.
|
||||
// logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type , e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - collNetTypeSupport: whether collnet supports this type
|
||||
@@ -40,16 +44,17 @@ typedef struct {
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
ncclResult_t (*destroy)();
|
||||
} ncclTuner_v1_t;
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
|
||||
typedef ncclTuner_v1_t ncclTuner_t;
|
||||
typedef ncclTuner_v2_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
|
||||
|
||||
#endif
|
||||
|
||||
+18
-2
@@ -33,7 +33,6 @@ union ncclProxyOpSpecifics {
|
||||
|
||||
struct ncclProxyOp {
|
||||
struct ncclProxyConnection* connection;
|
||||
void* buffer;
|
||||
ssize_t nbytes;
|
||||
uint64_t opCount;
|
||||
int root;
|
||||
@@ -49,6 +48,11 @@ struct ncclProxyOp {
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
uint8_t protocol;
|
||||
uint8_t reg;
|
||||
// collnet buffer reg handles
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
uint8_t* sendbuff;
|
||||
uint8_t* recvbuff;
|
||||
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
|
||||
@@ -58,8 +62,14 @@ struct ncclProxyOp {
|
||||
struct ncclProxySubArgs {
|
||||
struct ncclProxyConnection* connection;
|
||||
int reg;
|
||||
void* buffer;
|
||||
// p2p mhandle
|
||||
void* mhandle;
|
||||
// collnet handles
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
uint8_t* sendbuff;
|
||||
uint8_t* recvbuff;
|
||||
size_t offset;
|
||||
int channelId;
|
||||
int nsteps;
|
||||
ssize_t nbytes;
|
||||
@@ -88,6 +98,10 @@ struct ncclProxyArgs {
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int chunkSize;
|
||||
size_t totalSendSize;
|
||||
size_t totalRecvSize;
|
||||
size_t sendSizePerRound;
|
||||
size_t recvSizePerRound;
|
||||
uint8_t /*ncclDataType_t*/ dtype;
|
||||
uint8_t /*ncclDevRedOp_t*/ redOp;
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
@@ -302,6 +316,8 @@ enum ncclProxyMsgType {
|
||||
ncclProxyMsgAbort = 7,
|
||||
ncclProxyMsgStop = 8,
|
||||
ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
|
||||
ncclProxyMsgRegister = 10,
|
||||
ncclProxyMsgDeregister = 11
|
||||
};
|
||||
|
||||
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
|
||||
|
||||
@@ -5,7 +5,8 @@ enum {
|
||||
NET_REG_COMPLETE = 0x01,
|
||||
NVLS_REG_COMPLETE = 0x02,
|
||||
NVLS_REG_POSSIBLE = 0x04,
|
||||
NVLS_REG_NO_SUPPORT = 0x08
|
||||
NVLS_REG_NO_SUPPORT = 0x08,
|
||||
COLLNET_REG_COMPLETE = 0x10
|
||||
};
|
||||
|
||||
struct ncclReg {
|
||||
@@ -26,6 +27,9 @@ struct ncclReg {
|
||||
int dev;
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
|
||||
// collnet reg
|
||||
void* collnetHandle;
|
||||
struct ncclProxyConnector* proxyconn;
|
||||
};
|
||||
|
||||
struct ncclRegCache {
|
||||
|
||||
@@ -92,6 +92,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
|
||||
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
|
||||
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
|
||||
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
|
||||
ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
|
||||
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
|
||||
ncclResult_t ncclSocketClose(struct ncclSocket* sock);
|
||||
#endif
|
||||
|
||||
@@ -95,6 +95,8 @@ struct ncclTransportComm {
|
||||
ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
|
||||
ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
|
||||
ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
|
||||
ncclResult_t (*proxyRegister)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
|
||||
ncclResult_t (*proxyDeregister)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done);
|
||||
};
|
||||
|
||||
struct ncclTransport {
|
||||
@@ -107,15 +109,6 @@ struct ncclTransport {
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
|
||||
|
||||
// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
|
||||
#define USE_POSIX_FD 1
|
||||
|
||||
#if USE_POSIX_FD
|
||||
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
|
||||
#else
|
||||
#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
|
||||
#endif
|
||||
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
|
||||
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
|
||||
@@ -124,7 +117,10 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi
|
||||
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
|
||||
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
||||
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
|
||||
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
|
||||
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle);
|
||||
ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle);
|
||||
#endif
|
||||
|
||||
@@ -15,8 +15,8 @@
|
||||
// Attempts to load NCCL tuner from environmental variable.
|
||||
// Returns ncclSuccess if the correct tuner symbol has been found and
|
||||
// successfully loaded. Otherwise returns an error and also logs the error.
|
||||
ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner);
|
||||
ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner);
|
||||
|
||||
// Cleans up NCCL tuner plugin.
|
||||
ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner);
|
||||
ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner);
|
||||
#endif
|
||||
|
||||
+168
-114
@@ -117,6 +117,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
|
||||
// Overwrite the identifying fields of a communicator with invalid values and
// clear both magic numbers, so a later use of the object is recognizable as a
// use of a poisoned comm.
void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
  // Important that this does not trash intraComm0.
  comm->nRanks = -1;
  comm->busId = -1;
  comm->cudaDev = -1;
  comm->rank = -1;
  // Zeroed magics no longer match NCCL_MAGIC.
  comm->endMagic = 0;
  comm->startMagic = 0;
}
|
||||
|
||||
#undef NCCL_NO_OPTIMIZE
|
||||
@@ -280,7 +281,6 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
|
||||
ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
|
||||
/* comm must be ready, or error will be reported */
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
|
||||
ncclGroupJobAbort(comm->groupJob);
|
||||
} else {
|
||||
@@ -351,6 +351,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList);
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList);
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclCollnetHandleList);
|
||||
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
@@ -560,9 +561,8 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
}
|
||||
|
||||
// MNNVL support
|
||||
if (!comm->MNNVL && comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
|
||||
else if (comm->MNNVL || ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
|
||||
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
|
||||
else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
|
||||
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
|
||||
|
||||
// Make sure P2P chunksize is not larger than coll chunksize.
|
||||
@@ -584,16 +584,38 @@ NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
|
||||
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
|
||||
NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
|
||||
|
||||
// Build the CollNet dense <-> user rank translation tables of a communicator.
//
// Each rank first computes its own dense index: a rank listed in
// comm->collNetHeads takes its position in that list; any other rank takes the
// number of bits left set below its local rank in a local-rank bitmask from
// which the heads' bits have been toggled. The index is offset by
// node * localRanks, exchanged with bootstrapAllGather, and the gathered table
// is inverted into collNetDenseToUserRank.
//
// Returns ncclSuccess, or the error propagated from bootstrapAllGather.
static ncclResult_t collNetInitRailRankMap(ncclComm_t comm) {
  const int self = comm->rank;

  comm->collNetDenseToUserRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
  comm->collNetUserToDenseRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);

  // One bit per local rank; each visited head toggles its own bit.
  uint64_t remaining = (1ull << comm->localRanks) - 1;
  int dense = -1;
  // Stop walking the heads as soon as this rank is found among them.
  for (int h = 0; h < comm->collNetHeadsNum && dense == -1; h++) {
    const int headRank = comm->collNetHeads[h];
    remaining ^= 1ull << comm->rankToLocalRank[headRank];
    if (headRank == self) dense = h;
  }
  if (dense == -1) {
    // Not a head: count the remaining bits strictly below our local rank.
    const uint64_t below = (1ull << comm->localRank) - 1;
    dense = __builtin_popcountll(remaining & below);
  }
  comm->collNetUserToDenseRank[self] = dense + comm->node * comm->localRanks;

  NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int)));

  // Invert the gathered user->dense table.
  for (int user = 0; user < comm->nRanks; user++) {
    const int denseIdx = comm->collNetUserToDenseRank[user];
    comm->collNetDenseToUserRank[denseIdx] = user;
  }
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* collNetGraph) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int* heads = NULL;
|
||||
int rank = comm->rank;
|
||||
int collNetSetupFail = 0;
|
||||
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
|
||||
// Find all head ranks
|
||||
int nHeads = collNetGraph->nChannels;
|
||||
int nHeadsUnique = 0;
|
||||
int headsUnique[NCCL_MAX_LOCAL_RANKS];
|
||||
int* headsUnique = NULL;
|
||||
int highestTransportType0, highestTransportType1;
|
||||
char line[1024];
|
||||
bool share;
|
||||
@@ -604,27 +626,26 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
};
|
||||
struct collnetShareInfo* infos = NULL;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&headsUnique, collNetGraph->nChannels), ret, fail);
|
||||
{ uint64_t mask = 0;
|
||||
// Head GPU index is always 0
|
||||
for (int c = 0; c < nHeads; c++) {
|
||||
heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
|
||||
assert(comm->rankToNode[heads[c]] == comm->node);
|
||||
for (int c = 0; c < collNetGraph->nChannels; c++) {
|
||||
int head = collNetGraph->intra[c * comm->localRanks + 0];
|
||||
assert(comm->rankToNode[head] == comm->node);
|
||||
uint64_t mask0 = mask;
|
||||
mask |= 1ull<<comm->rankToLocalRank[heads[c]];
|
||||
if (mask != mask0) headsUnique[nHeadsUnique++] = heads[c];
|
||||
mask |= 1ull<<comm->rankToLocalRank[head];
|
||||
if (mask != mask0) headsUnique[nHeadsUnique++] = head;
|
||||
}
|
||||
}
|
||||
|
||||
comm->collNetHeads = heads;
|
||||
comm->collNetHeadsNum = nHeads;
|
||||
comm->collNetHeadsUniqueNum = nHeadsUnique;
|
||||
comm->collNetHeads = headsUnique;
|
||||
comm->collNetHeadsNum = nHeadsUnique;
|
||||
if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) {
|
||||
NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail);
|
||||
/* check whether child can share collnet resources of parent. Since parent builds each collnet communicator
|
||||
* based on heads with the same head position in each node, as long as the collnet heads of child comm
|
||||
* can match parent's heads, we can let child communicator share parent's collnet resources. */
|
||||
for (int h = 0; h < nHeads; ++h) {
|
||||
for (int h = 0; h < nHeadsUnique; ++h) {
|
||||
int prev = INT_MIN;
|
||||
struct collnetShareInfo* myinfo;
|
||||
|
||||
@@ -632,7 +653,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
myinfo = infos + comm->rank;
|
||||
memset(myinfo, 0, sizeof(struct collnetShareInfo));
|
||||
/* find the child head position in parent collnet heads. */
|
||||
if (heads[h] == comm->rank) {
|
||||
if (headsUnique[h] == comm->rank) {
|
||||
myinfo->headPosition = -1;
|
||||
myinfo->isMaster = 1;
|
||||
for (int th = 0; th < parent->collNetHeadsNum; ++th)
|
||||
@@ -658,10 +679,11 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
if (share) {
|
||||
if (myinfo->isMaster) {
|
||||
comm->collNetSharedRes = parent->collNetSharedRes;
|
||||
comm->collNetChannels = std::min(comm->nChannels, parent->collNetSharedRes->nChannels);
|
||||
for (int c = 0; c < comm->collNetChannels; ++c)
|
||||
for (int c = 0; c < comm->nChannels; ++c)
|
||||
NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail);
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail);
|
||||
} else {
|
||||
/* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot
|
||||
* share the sharp resource from parent, we cannot use sharp in this case. This restriction might be
|
||||
@@ -677,35 +699,19 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
} else {
|
||||
/* this allocated buffer will be freed on proxy side */
|
||||
NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
|
||||
comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
|
||||
comm->collNetSharedRes->nChannels = comm->nChannels;
|
||||
comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
|
||||
|
||||
comm->collNetDenseToUserRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
|
||||
comm->collNetUserToDenseRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
|
||||
{ // initialize collNetUserToDenseRank[rank]
|
||||
uint64_t nonHeadMask = (1ull<<comm->localRanks)-1;
|
||||
comm->collNetUserToDenseRank[rank] = -1;
|
||||
for (int h=0; h < nHeadsUnique; h++) {
|
||||
nonHeadMask ^= 1ull<<comm->rankToLocalRank[headsUnique[h]];
|
||||
if (headsUnique[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; }
|
||||
}
|
||||
if (comm->collNetUserToDenseRank[rank] == -1) {
|
||||
comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull<<comm->localRank)-1));
|
||||
}
|
||||
comm->collNetUserToDenseRank[rank] += comm->node*comm->localRanks;
|
||||
}
|
||||
NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int)));
|
||||
for (int r=0; r < comm->nRanks; r++) {
|
||||
comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r;
|
||||
}
|
||||
NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail);
|
||||
|
||||
for (int c = 0; c < comm->collNetChannels; c++) {
|
||||
for (int c = 0; c < comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels + c;
|
||||
NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail);
|
||||
for (int h = 0; h < nHeads; h++) {
|
||||
const int head = heads[h];
|
||||
collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
|
||||
if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
|
||||
for (int h = 0; h < nHeadsUnique; h++) {
|
||||
const int head = headsUnique[h];
|
||||
ncclConnect connect;
|
||||
collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv, &connect);
|
||||
if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend, &connect);
|
||||
}
|
||||
// Verify CollNet setup across ranks after trying the first channel
|
||||
if (c == 0) {
|
||||
@@ -727,7 +733,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
|
||||
bool isHead = false;
|
||||
matrix = nullptr;
|
||||
NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end);
|
||||
for (int h = 0; h < nHeads; h++) isHead |= (heads[h] == comm->rank);
|
||||
for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank);
|
||||
if (isHead) {
|
||||
for (int ty=0; ty < ncclNumTypes; ty++) {
|
||||
for (int i=0; i < 4; i++) {
|
||||
@@ -817,7 +823,72 @@ fail:
|
||||
}
|
||||
|
||||
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
|
||||
NCCL_PARAM(MNNVL, "MNNVL", -2);
|
||||
NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
|
||||
// Determine if MNNVL support is available
|
||||
static int checkMNNVL(struct ncclComm* comm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
// MNNVL requires cuMem to be enabled
|
||||
if (!ncclCuMemEnable()) return 0;
|
||||
|
||||
// MNNVL also requires FABRIC handle support
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUdevice currentDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(¤tDev, cudaDev));
|
||||
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
|
||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
|
||||
if (!flag) return 0;
|
||||
// Check that all ranks have initialized the fabric fully
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0;
|
||||
}
|
||||
|
||||
// Determine our MNNVL domain/clique
|
||||
NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail);
|
||||
comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId;
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
|
||||
// Check if the cluster UUID and cliqueId match
|
||||
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
if (i == comm->rank) {
|
||||
comm->cliqueRank = comm->clique.size;
|
||||
}
|
||||
comm->clique.ranks[comm->clique.size++] = i;
|
||||
}
|
||||
}
|
||||
// Determine whether to enable MNNVL or not
|
||||
comm->MNNVL = ncclParamMNNVLEnable() == 2 ? comm->clique.size > 1 : ncclParamMNNVLEnable();
|
||||
INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
|
||||
|
||||
if (comm->MNNVL) {
|
||||
// Force the CUMEM handle type to be FABRIC for MNNVL
|
||||
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
|
||||
return comm->MNNVL;
|
||||
|
||||
fail:
|
||||
if (comm->clique.ranks) free(comm->clique.ranks);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
// Fallback compiled when CUDART_VERSION < 11030: none of the fabric-handle
// checks are built in, so MNNVL is always reported as unavailable (returns 0)
// and comm is left untouched.
static int checkMNNVL(struct ncclComm* comm) {
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {
|
||||
// We use 2 AllGathers
|
||||
@@ -842,6 +913,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
float bwInter;
|
||||
int typeIntra;
|
||||
int typeInter;
|
||||
int crossNic;
|
||||
};
|
||||
|
||||
struct allGatherInfo {
|
||||
@@ -875,61 +947,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
// AllGather1 - end
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
|
||||
// MNNVL support
|
||||
if (nNodes > 1) {
|
||||
int cliqueSize = 0;
|
||||
comm->MNNVL = 0;
|
||||
// Determine the size of the MNNVL domain/clique
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[rank].fabricInfo;
|
||||
nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
|
||||
// Check that the Fabric state is fully initialized
|
||||
if (fabricInfo2->state != NVML_GPU_FABRIC_STATE_COMPLETED) continue;
|
||||
// Check that the cluster UUID and cliqueId match in each rank
|
||||
// A zero UUID means we don't have MNNVL fabric info - disable MNNVL
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) continue;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
cliqueSize++;
|
||||
}
|
||||
}
|
||||
// Determine whether this is a MNNVL system
|
||||
comm->MNNVL = ncclParamMNNVL() < 0 ? cliqueSize == comm->nRanks : ncclParamMNNVL();
|
||||
// MNNVL requires cuMem to be enabled
|
||||
if (!ncclCuMemEnable()) comm->MNNVL = 0;
|
||||
if (comm->MNNVL) {
|
||||
// MNNVL also requires FABRIC handle support
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
CUdevice currentDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
|
||||
// Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
|
||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
|
||||
if (!flag)
|
||||
comm->MNNVL = 0;
|
||||
else
|
||||
// Force the handle type to be FABRIC for MNNVL
|
||||
ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
if (ncclParamMNNVL() == 1 && !comm->MNNVL) {
|
||||
WARN("MNNVL is not supported on this system");
|
||||
ret = ncclSystemError;
|
||||
goto fail;
|
||||
}
|
||||
if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) {
|
||||
// Return an error if the user specifically requested MNNVL support
|
||||
WARN("MNNVL is not supported on this system");
|
||||
ret = ncclSystemError;
|
||||
goto fail;
|
||||
}
|
||||
#endif
|
||||
|
||||
do {
|
||||
// Compute intra-process ranks
|
||||
int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
|
||||
for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap);
|
||||
for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap);
|
||||
for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
|
||||
for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
|
||||
|
||||
comm->nvlsRegSupport = 1;
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
@@ -955,6 +985,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Buffer Registration is not supported with MNNVL
|
||||
if (comm->MNNVL) comm->nvlsRegSupport = 0;
|
||||
|
||||
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
|
||||
rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
|
||||
if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
|
||||
@@ -1065,6 +1099,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
|
||||
allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
|
||||
allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
|
||||
allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
|
||||
}
|
||||
|
||||
comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
@@ -1137,10 +1172,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
|
||||
graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
|
||||
graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
|
||||
graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
|
||||
}
|
||||
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
|
||||
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = 0;
|
||||
}
|
||||
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
|
||||
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
|
||||
|
||||
comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
if (comm->nChannels < nChannelsOrig) {
|
||||
@@ -1156,17 +1192,22 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
|
||||
comm->collNetSupport = 0;
|
||||
}
|
||||
comm->collNetRegSupport = true;
|
||||
for (int n=0; n<comm->nNodes; n++) {
|
||||
if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
|
||||
WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
|
||||
comm->collNetSupport = 0;
|
||||
break;
|
||||
}
|
||||
if (comm->nodeRanks[n].localRanks > 1) {
|
||||
// As long as there is more than 1 rank on any node, we need to disable collnet reg
|
||||
comm->collNetRegSupport = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
|
||||
// AllGather3 - end
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
|
||||
@@ -1253,7 +1294,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Compute time models for algorithm and protocol combinations
|
||||
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
|
||||
|
||||
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->collNetChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
|
||||
do { // Setup p2p structures in comm->tasks
|
||||
struct ncclTasks* tasks = &comm->tasks;
|
||||
@@ -1360,7 +1401,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
|
||||
|
||||
/* Local intra-node barrier */
|
||||
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
|
||||
|
||||
// We should have allocated all buffers, collective fifos, ... we can
|
||||
// restore the affinity.
|
||||
@@ -1496,13 +1537,19 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
comm->cudaArch = cudaArch;
|
||||
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
|
||||
if (job->parent) {
|
||||
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
|
||||
} else {
|
||||
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);
|
||||
|
||||
NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail);
|
||||
NCCLCHECKGOTO(ncclTunerPluginLoad(&comm->tuner), res, fail);
|
||||
if (comm->tuner) {
|
||||
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog));
|
||||
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext));
|
||||
}
|
||||
|
||||
// update communicator state
|
||||
@@ -1519,8 +1566,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev);
|
||||
}
|
||||
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
|
||||
if (job->parent) {
|
||||
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init COMPLETE",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
|
||||
} else {
|
||||
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
|
||||
}
|
||||
exit:
|
||||
if (job->newcomm) {
|
||||
/* assign it to user pointer. */
|
||||
@@ -1729,6 +1781,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail);
|
||||
comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption.
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc((uint32_t**)&comm->abortFlagRefCount, 1), res, fail);
|
||||
*comm->abortFlagRefCount = 1;
|
||||
@@ -1926,8 +1979,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) {
|
||||
}
|
||||
|
||||
if (comm->tuner != NULL) {
|
||||
NCCLCHECK(comm->tuner->destroy());
|
||||
NCCLCHECK(ncclCloseTunerPlugin(&comm->tuner));
|
||||
NCCLCHECK(comm->tuner->destroy(comm->tunerContext));
|
||||
NCCLCHECK(ncclTunerPluginUnload(&comm->tuner));
|
||||
}
|
||||
|
||||
NCCLCHECK(commFree(comm));
|
||||
@@ -2142,7 +2195,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail);
|
||||
NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail);
|
||||
NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
|
||||
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
|
||||
|
||||
@@ -2152,6 +2205,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
|
||||
INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail);
|
||||
childComm->startMagic = childComm->endMagic = NCCL_MAGIC;
|
||||
if (comm->config.splitShare) {
|
||||
childComm->abortFlag = comm->abortFlag;
|
||||
childComm->abortFlagRefCount = comm->abortFlagRefCount;
|
||||
@@ -2224,7 +2278,7 @@ const char* ncclGetLastError(ncclComm_t comm) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
|
||||
NCCLCHECK(CommCheck(comm, "ncclGetAsyncError", "comm"));
|
||||
NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
|
||||
|
||||
*asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
|
||||
@@ -2236,7 +2290,7 @@ NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
|
||||
NCCLCHECK(CommCheck(comm, "CommCount", "comm"));
|
||||
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
|
||||
|
||||
/* init thread must be joined before we access the attributes of comm. */
|
||||
@@ -2250,7 +2304,7 @@ NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
|
||||
NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm"));
|
||||
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
|
||||
|
||||
NCCLCHECK(ncclCommEnsureReady(comm));
|
||||
@@ -2263,7 +2317,7 @@ NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
|
||||
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
|
||||
NCCLCHECK(CommCheck(comm, "CommUserRank", "comm"));
|
||||
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
|
||||
|
||||
NCCLCHECK(ncclCommEnsureReady(comm));
|
||||
@@ -2302,7 +2356,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
if (mcSupport) {
|
||||
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
memprop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
|
||||
memprop.requestedHandleTypes = ncclCuMemHandleType;
|
||||
memprop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
|
||||
@@ -2314,7 +2368,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
mcprop.size = size;
|
||||
/* device cnt is a dummy value right now, it might affect mc granularity in the future. */
|
||||
mcprop.numDevices = dcnt;
|
||||
mcprop.handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
|
||||
mcprop.handleTypes = ncclCuMemHandleType;
|
||||
mcprop.flags = 0;
|
||||
CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
|
||||
|
||||
|
||||
@@ -33,6 +33,15 @@ ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Validate a communicator handle passed in by the user.
//   comm    - communicator to validate
//   opname  - name of the calling operation, forwarded to PtrCheck
//   ptrname - name of the argument, forwarded to PtrCheck
// Returns whatever PtrCheck reports when it fails, ncclInvalidArgument when
// either magic number of the object does not equal NCCL_MAGIC, and
// ncclSuccess otherwise.
ncclResult_t CommCheck(struct ncclComm* comm, const char* opname, const char* ptrname) {
  NCCLCHECK(PtrCheck(comm, opname, ptrname));

  // Both guard words around the object must still hold the magic value.
  const bool intact = comm->startMagic == NCCL_MAGIC && comm->endMagic == NCCL_MAGIC;
  if (intact) return ncclSuccess;

  WARN("Error: corrupted comm object detected");
  return ncclInvalidArgument;
}
|
||||
|
||||
ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
// First, the easy ones
|
||||
if (info->root < 0 || info->root >= info->comm->nRanks) {
|
||||
|
||||
+85
-128
@@ -9,8 +9,6 @@
|
||||
#include "param.h"
|
||||
#include "cudawrap.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
|
||||
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
|
||||
|
||||
@@ -51,112 +49,119 @@ int ncclCuMemEnable() {
|
||||
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
|
||||
}
|
||||
|
||||
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
|
||||
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN(cuDeviceGet, 2000);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString, 6000);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName, 6000);
|
||||
DECLARE_CUDA_PFN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName);
|
||||
/* enqueue.cc */
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange);
|
||||
/* proxy.cc */
|
||||
DECLARE_CUDA_PFN(cuCtxCreate, 3020);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
|
||||
DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
|
||||
DECLARE_CUDA_PFN(cuCtxCreate);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN(cuCtxGetCurrent);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN(cuCtxGetDevice);
|
||||
/* cuMem API support */
|
||||
DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemCreate, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemMap, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemRelease, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
|
||||
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemAddressReserve);
|
||||
DECLARE_CUDA_PFN(cuMemAddressFree);
|
||||
DECLARE_CUDA_PFN(cuMemCreate);
|
||||
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity);
|
||||
DECLARE_CUDA_PFN(cuMemExportToShareableHandle);
|
||||
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle);
|
||||
DECLARE_CUDA_PFN(cuMemMap);
|
||||
DECLARE_CUDA_PFN(cuMemRelease);
|
||||
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
|
||||
DECLARE_CUDA_PFN(cuMemSetAccess);
|
||||
DECLARE_CUDA_PFN(cuMemUnmap);
|
||||
/* ncclMemAlloc/Free */
|
||||
DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000);
|
||||
DECLARE_CUDA_PFN(cuPointerGetAttribute);
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* transport/collNet.cc/net.cc*/
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
#endif
|
||||
#if CUDA_VERSION >= 12010
|
||||
/* NVSwitch Multicast support */
|
||||
DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastAddDevice);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindMem);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindAddr);
|
||||
DECLARE_CUDA_PFN(cuMulticastCreate);
|
||||
DECLARE_CUDA_PFN(cuMulticastGetGranularity);
|
||||
DECLARE_CUDA_PFN(cuMulticastUnbind);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN(cuInit, 2000);
|
||||
DECLARE_CUDA_PFN(cuDriverGetVersion, 2020);
|
||||
DECLARE_CUDA_PFN(cuGetProcAddress, 11030);
|
||||
|
||||
#define CUDA_DRIVER_MIN_VERSION 11030
|
||||
|
||||
static void *cudaLib;
|
||||
int ncclCudaDriverVersionCache = -1;
|
||||
bool ncclCudaLaunchBlocking = false;
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
/*
 * LOAD_SYM(symbol, ignore): resolve the driver entry point named `symbol`
 * through cudaGetDriverEntryPoint() and store it in pfn_<symbol>.
 * The macro assigns to a variable named `res`, which must be declared in the
 * enclosing scope. When the lookup fails and `ignore` is zero it emits a WARN
 * and returns ncclSystemError from the enclosing function; when `ignore` is
 * non-zero the failure is silently skipped.
 * The CUDART_VERSION >= 12000 variant passes an extra driverStatus output
 * argument and also treats driverStatus != cudaDriverEntryPointSuccess as a
 * failure; the other variant uses the three-argument call.
 */
#if CUDART_VERSION >= 12000
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
cudaDriverEntryPointQueryResult driverStatus; \
|
||||
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
|
||||
if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
|
||||
if (!ignore) { \
|
||||
WARN("Retrieve %s failed with %d status %d", #symbol, res, driverStatus); \
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
#else
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault); \
|
||||
if (res != cudaSuccess) { \
|
||||
if (!ignore) { \
|
||||
WARN("Retrieve %s failed with %d", #symbol, res); \
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
Load the CUDA symbols
|
||||
*/
|
||||
static ncclResult_t cudaPfnFuncLoader(void) {
|
||||
CUresult res;
|
||||
|
||||
#define LOAD_SYM(symbol, version, ignore) do { \
|
||||
res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), version, 0); \
|
||||
if (res != 0) { \
|
||||
if (!ignore) { \
|
||||
WARN("Retrieve %s version %d failed with %d", #symbol, version, res); \
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
cudaError_t res;
|
||||
|
||||
LOAD_SYM(cuGetErrorString, 6000, 0);
|
||||
LOAD_SYM(cuGetErrorName, 6000, 0);
|
||||
LOAD_SYM(cuDeviceGet, 2000, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 2000, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 3020, 1);
|
||||
LOAD_SYM(cuCtxCreate, 3020, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 4000, 1);
|
||||
LOAD_SYM(cuCtxGetCurrent, 4000, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 4000, 1);
|
||||
LOAD_SYM(cuCtxGetDevice, 2000, 1);
|
||||
LOAD_SYM(cuGetErrorString, 0);
|
||||
LOAD_SYM(cuGetErrorName, 0);
|
||||
LOAD_SYM(cuDeviceGet, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 1);
|
||||
LOAD_SYM(cuCtxCreate, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 1);
|
||||
LOAD_SYM(cuCtxGetCurrent, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 1);
|
||||
LOAD_SYM(cuCtxGetDevice, 1);
|
||||
/* cuMem API support */
|
||||
LOAD_SYM(cuMemAddressReserve, 10020, 1);
|
||||
LOAD_SYM(cuMemAddressFree, 10020, 1);
|
||||
LOAD_SYM(cuMemCreate, 10020, 1);
|
||||
LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1);
|
||||
LOAD_SYM(cuMemExportToShareableHandle, 10020, 1);
|
||||
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
|
||||
LOAD_SYM(cuMemMap, 10020, 1);
|
||||
LOAD_SYM(cuMemRelease, 10020, 1);
|
||||
LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
|
||||
LOAD_SYM(cuMemSetAccess, 10020, 1);
|
||||
LOAD_SYM(cuMemUnmap, 10020, 1);
|
||||
LOAD_SYM(cuMemAddressReserve, 1);
|
||||
LOAD_SYM(cuMemAddressFree, 1);
|
||||
LOAD_SYM(cuMemCreate, 1);
|
||||
LOAD_SYM(cuMemGetAllocationGranularity, 1);
|
||||
LOAD_SYM(cuMemExportToShareableHandle, 1);
|
||||
LOAD_SYM(cuMemImportFromShareableHandle, 1);
|
||||
LOAD_SYM(cuMemMap, 1);
|
||||
LOAD_SYM(cuMemRelease, 1);
|
||||
LOAD_SYM(cuMemRetainAllocationHandle, 1);
|
||||
LOAD_SYM(cuMemSetAccess, 1);
|
||||
LOAD_SYM(cuMemUnmap, 1);
|
||||
/* ncclMemAlloc/Free */
|
||||
LOAD_SYM(cuPointerGetAttribute, 4000, 1);
|
||||
LOAD_SYM(cuPointerGetAttribute, 1);
|
||||
#if CUDA_VERSION >= 11070
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
|
||||
#endif
|
||||
#if CUDA_VERSION >= 12010
|
||||
/* NVSwitch Multicast support */
|
||||
LOAD_SYM(cuMulticastAddDevice, 12010, 1);
|
||||
LOAD_SYM(cuMulticastBindMem, 12010, 1);
|
||||
LOAD_SYM(cuMulticastBindAddr, 12010, 1);
|
||||
LOAD_SYM(cuMulticastCreate, 12010, 1);
|
||||
LOAD_SYM(cuMulticastGetGranularity, 12010, 1);
|
||||
LOAD_SYM(cuMulticastUnbind, 12010, 1);
|
||||
LOAD_SYM(cuMulticastAddDevice, 1);
|
||||
LOAD_SYM(cuMulticastBindMem, 1);
|
||||
LOAD_SYM(cuMulticastBindAddr, 1);
|
||||
LOAD_SYM(cuMulticastCreate, 1);
|
||||
LOAD_SYM(cuMulticastGetGranularity, 1);
|
||||
LOAD_SYM(cuMulticastUnbind, 1);
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -171,47 +176,12 @@ static void initOnceFunc() {
|
||||
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
|
||||
} while (0);
|
||||
|
||||
CUresult res;
|
||||
/*
|
||||
* Load CUDA driver library
|
||||
*/
|
||||
char path[1024];
|
||||
const char *ncclCudaPath = ncclGetEnv("NCCL_CUDA_PATH");
|
||||
if (ncclCudaPath == NULL)
|
||||
snprintf(path, 1024, "%s", "libcuda.so");
|
||||
else
|
||||
snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so");
|
||||
|
||||
(void) dlerror(); // Clear any previous errors
|
||||
cudaLib = dlopen(path, RTLD_LAZY);
|
||||
if (cudaLib == NULL) {
|
||||
WARN("Failed to find CUDA library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror());
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Load initial CUDA functions
|
||||
*/
|
||||
|
||||
pfn_cuInit = (PFN_cuInit_v2000) dlsym(cudaLib, "cuInit");
|
||||
if (pfn_cuInit == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuInit");
|
||||
goto error;
|
||||
}
|
||||
|
||||
pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion_v2020) dlsym(cudaLib, "cuDriverGetVersion");
|
||||
if (pfn_cuDriverGetVersion == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
|
||||
goto error;
|
||||
}
|
||||
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int cudaDev;
|
||||
int driverVersion;
|
||||
res = pfn_cuDriverGetVersion(&driverVersion);
|
||||
if (res != 0) {
|
||||
WARN("cuDriverGetVersion failed with %d", res);
|
||||
goto error;
|
||||
}
|
||||
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error); // Initialize the driver
|
||||
|
||||
CUDACHECKGOTO(cudaDriverGetVersion(&driverVersion), ret, error);
|
||||
INFO(NCCL_INIT, "cudaDriverVersion %d", driverVersion);
|
||||
|
||||
if (driverVersion < CUDA_DRIVER_MIN_VERSION) {
|
||||
@@ -220,19 +190,6 @@ static void initOnceFunc() {
|
||||
goto error;
|
||||
}
|
||||
|
||||
pfn_cuGetProcAddress = (PFN_cuGetProcAddress_v11030) dlsym(cudaLib, "cuGetProcAddress");
|
||||
if (pfn_cuGetProcAddress == NULL) {
|
||||
WARN("Failed to load CUDA missing symbol cuGetProcAddress");
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Required to initialize the CUDA Driver.
|
||||
* Multiple calls of cuInit() will return immediately
|
||||
* without making any relevant change
|
||||
*/
|
||||
pfn_cuInit(0);
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
if (cudaPfnFuncLoader()) {
|
||||
WARN("CUDA some PFN functions not found in the library");
|
||||
@@ -243,7 +200,7 @@ static void initOnceFunc() {
|
||||
// Determine whether we support the cuMem APIs or not
|
||||
ncclCuMemSupported = ncclIsCuMemSupported();
|
||||
|
||||
initResult = ncclSuccess;
|
||||
initResult = ret;
|
||||
return;
|
||||
error:
|
||||
initResult = ncclSystemError;
|
||||
|
||||
@@ -790,6 +790,24 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Sends sendSize bytes from sendPtr on sendSock while receiving recvSize bytes
// into recvPtr from recvSock, advancing both transfers in the same loop until
// each has reached its requested size.
// Returns ncclInternalError when either socket pointer is NULL or either
// socket is not in ncclSocketStateReady; any error reported by socketProgress
// is propagated through NCCLCHECK; ncclSuccess otherwise.
ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize) {
  int bytesSent = 0;
  int bytesReceived = 0;

  if (sendSock == NULL || recvSock == NULL) {
    WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
    return ncclInternalError;
  }
  if (sendSock->state != ncclSocketStateReady || recvSock->state != ncclSocketStateReady) {
    WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
    return ncclInternalError;
  }

  for (;;) {
    // Stop once both directions have moved all of their bytes.
    if (bytesSent >= sendSize && bytesReceived >= recvSize) break;

    // Each direction is only progressed while it still has data outstanding.
    if (bytesSent < sendSize) {
      NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sendSock, sendPtr, sendSize, &bytesSent));
    }
    if (bytesReceived < recvSize) {
      NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, recvSock, recvPtr, recvSize, &bytesReceived));
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
|
||||
// Receive or detect connection closed
|
||||
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
|
||||
int offset = 0;
|
||||
|
||||
+160
-51
@@ -13,69 +13,178 @@
|
||||
#include "nccl_tuner.h"
|
||||
|
||||
pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int tunerPluginRefCount = -1;
|
||||
static int tunerPluginRefCount;
|
||||
static void* tunerPluginLib = nullptr;
|
||||
ncclTuner_t* tunerSymbol = nullptr;
|
||||
|
||||
ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
|
||||
// Initialize to nullptr by default if plugin tuner cannot be loaded.
|
||||
*tuner = nullptr;
|
||||
if (tunerPluginRefCount == -2) return ncclSuccess;
|
||||
|
||||
pthread_mutex_lock(&tunerPluginLock);
|
||||
if (tunerPluginRefCount == -1) {
|
||||
tunerPluginRefCount = -2; // Default: no plugin, don't try again later
|
||||
|
||||
const char* name = getenv("NCCL_TUNER_PLUGIN");
|
||||
if (name) {
|
||||
INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
|
||||
tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
|
||||
if (tunerPluginLib == nullptr) {
|
||||
// dlopen does not guarantee to set errno, but dlerror only gives us a
|
||||
// string, so checking errno doesn't hurt to try to provide a better
|
||||
// error message
|
||||
if (errno == ENOENT) {
|
||||
INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
|
||||
}
|
||||
} else {
|
||||
tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
|
||||
if (tunerSymbol == nullptr) {
|
||||
INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
|
||||
dlclose(tunerPluginLib);
|
||||
tunerPluginLib = nullptr;
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
|
||||
tunerPluginRefCount = 0;
|
||||
}
|
||||
}
|
||||
static void* tryOpenDynamicLib(const char* name) {
|
||||
if (nullptr == name || strlen(name) == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
|
||||
if (nullptr == handle) {
|
||||
if (ENOENT == errno) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: No plugin found (%s)", name);
|
||||
}
|
||||
}
|
||||
|
||||
if (tunerPluginRefCount >= 0) {
|
||||
*tuner = tunerSymbol;
|
||||
INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name);
|
||||
tunerPluginRefCount++;
|
||||
}
|
||||
pthread_mutex_unlock(&tunerPluginLock);
|
||||
return ncclSuccess;
|
||||
return handle;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) {
|
||||
// Logs one summary line after every attempt to open a tuner plugin library
// has failed.
//
// pluginNames: writable, NUL-terminated list of the library names that were
//              tried, each followed by a single space. The buffer is edited
//              in place: the trailing separator is removed and, when the list
//              holds more than one name, the last name is cut off before the
//              remaining list is logged.
//
// A null pointer is ignored and an empty list is tolerated; indexing
// pluginNames[len - 1] unconditionally would write before the buffer when the
// string is empty.
static void summarizeOpenTunerPluginLibErrors(char* pluginNames) {
  const char *separator = " ";
  if (pluginNames == nullptr) return;

  // Save errno before any other library call gets a chance to overwrite it.
  const int loadErrno = errno;
  // dlerror() returns NULL when no error is pending; never pass NULL to %s.
  const char *loadError = dlerror();
  if (loadError == nullptr) loadError = "no error message available";

  size_t len = strlen(pluginNames);
  if (len > 0) {
    // remove tail separator
    pluginNames[len - 1] = '\0';
  }

  // remove last plugin name: cut at the last separator, unless that separator
  // is the very first character (nothing would be left to report).
  char *lastSeparator = strrchr(pluginNames, *separator);
  if (lastSeparator != nullptr && lastSeparator != pluginNames) {
    *lastSeparator = '\0';
  }

  // distinguish between one load attempt and multiple attempts
  if (strstr(pluginNames, separator)) {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Most recent plugin load returned %d : %s. All attempts to load '%s' also failed.", loadErrno, loadError, pluginNames);
  } else {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin load returned %d : %s : when loading %s", loadErrno, loadError, pluginNames);
  }
}
|
||||
|
||||
static void* openTunerPluginLib(void) {
|
||||
void *pluginLib;
|
||||
|
||||
#define MAX_PLUGIN_LOAD 4
|
||||
|
||||
int len;
|
||||
char tunerPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
|
||||
char *ptr = tunerPluginLibNameTried;
|
||||
char tunerPluginLibName[PATH_MAX];
|
||||
const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN");
|
||||
if (envTunerPluginName && strlen(envTunerPluginName)) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName);
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName);
|
||||
pluginLib = tryOpenDynamicLib(tunerPluginLibName);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName);
|
||||
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName);
|
||||
pluginLib = tryOpenDynamicLib(tunerPluginLibName);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName);
|
||||
} else {
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so");
|
||||
pluginLib = tryOpenDynamicLib(tunerPluginLibName);
|
||||
if (pluginLib) {
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName);
|
||||
}
|
||||
|
||||
const char *envNetPluginName = getenv("NCCL_NET_PLUGIN");
|
||||
if (envNetPluginName && strlen(envNetPluginName)) {
|
||||
// Users are allowed to pack tuner into the net plugin
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName);
|
||||
pluginLib = tryOpenDynamicLib(tunerPluginLibName);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName);
|
||||
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName);
|
||||
pluginLib = tryOpenDynamicLib(tunerPluginLibName);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName);
|
||||
} else {
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so");
|
||||
pluginLib = tryOpenDynamicLib(tunerPluginLibName);
|
||||
if (pluginLib) {
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName);
|
||||
}
|
||||
summarizeOpenTunerPluginLibErrors(ptr);
|
||||
|
||||
tunerPluginLibName[0] = '\0';
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// States of the one-time tuner plugin load attempt tracked by
// ncclTunerPluginLoad, listed in lifecycle order.
enum {
  // Initial state: no load has been attempted yet.
  tunerPluginLoadReady = 0,
  // A plugin library was opened and its tuner symbol resolved.
  tunerPluginLoadSuccess = 1,
  // The load attempt failed; later calls return without a plugin tuner.
  tunerPluginLoadFailed = -1,
};
|
||||
|
||||
// Hands out the external tuner plugin, loading it on the first call.
//
// tuner: set to the plugin's ncclTuner_t on success, or to nullptr when no
//        plugin is available (the caller then has no plugin tuner).
//
// The load is attempted once; its outcome is remembered in a function-local
// static. Every call that hands out the plugin increments tunerPluginRefCount.
// Always returns ncclSuccess: a missing plugin is not an error.
ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) {
  // Outcome of the load attempt, kept across calls.
  static int status = tunerPluginLoadReady;

  // Initialize to nullptr by default if plugin tuner cannot be loaded.
  *tuner = nullptr;

  // An earlier attempt already failed: nothing to do, skip taking the lock.
  if (status == tunerPluginLoadFailed) {
    return ncclSuccess;
  }

  pthread_mutex_lock(&tunerPluginLock);

  if (status == tunerPluginLoadSuccess) {
    // Plugin already loaded: share the symbol and count the new user.
    *tuner = tunerSymbol;
    ++tunerPluginRefCount;
  } else if (status != tunerPluginLoadFailed) {
    // Re-checked under the lock: still not attempted, so try to load now.
    int loaded = 0;

    tunerPluginLib = openTunerPluginLib();
    if (tunerPluginLib == nullptr) {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin.");
    } else {
      tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
      if (tunerSymbol != nullptr) {
        INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name);
        *tuner = tunerSymbol;
        ++tunerPluginRefCount;
        loaded = 1;
      } else {
        INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find " NCCL_TUNER_PLUGIN_SYMBOL ", using internal tuner instead.");
        dlclose(tunerPluginLib);
      }
    }

    if (loaded) {
      status = tunerPluginLoadSuccess;
    } else {
      // Forget the (closed or never opened) handle and remember the failure.
      tunerPluginLib = nullptr;
      status = tunerPluginLoadFailed;
    }
  }

  pthread_mutex_unlock(&tunerPluginLock);
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner) {
|
||||
if (*tuner == nullptr) return ncclSuccess;
|
||||
pthread_mutex_lock(&tunerPluginLock);
|
||||
if (--tunerPluginRefCount == 0) {
|
||||
if (tunerPluginLib == nullptr) {
|
||||
WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n");
|
||||
} else {
|
||||
INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name);
|
||||
dlclose(tunerPluginLib);
|
||||
}
|
||||
if (0 == (--tunerPluginRefCount)) {
|
||||
INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
|
||||
dlclose(tunerPluginLib);
|
||||
tunerPluginLib = nullptr;
|
||||
tunerSymbol = nullptr;
|
||||
*tuner = nullptr;
|
||||
tunerPluginRefCount = -1;
|
||||
}
|
||||
pthread_mutex_unlock(&tunerPluginLock);
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -174,7 +174,6 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
|
||||
|
||||
/* Register CUDA buffer for zero-copy operation */
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
@@ -433,14 +432,6 @@ ncclResult_t pncclGroupStart();
|
||||
ncclResult_t ncclGroupEnd();
|
||||
ncclResult_t pncclGroupEnd();
|
||||
|
||||
/* Register CUDA buffer for zero-copy operation */
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
|
||||
/* Deregister CUDA buffer */
|
||||
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
#endif
|
||||
|
||||
+78
-17
@@ -339,26 +339,87 @@ enum ncclNetState {
|
||||
enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
|
||||
ncclResult_t ncclNetPluginInit() {
|
||||
char ncclNetPluginName[128];
|
||||
const char* envPluginName = ncclGetEnv("NCCL_NET_PLUGIN");
|
||||
if (envPluginName && strlen(envPluginName)) {
|
||||
snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
|
||||
INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName);
|
||||
} else {
|
||||
sprintf(ncclNetPluginName, "libnccl-net.so");
|
||||
static void* tryOpenDynamicLib(char* name) {
|
||||
if (nullptr == name || strlen(name) == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
|
||||
if (netPluginLib == nullptr) {
|
||||
// dlopen does not guarantee to set errno, but dlerror only gives us a
|
||||
// string, so checking errno doesn't hurt to try to provide a better
|
||||
// error message
|
||||
if (errno == ENOENT) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : dlerror=%s No plugin found (%s), using internal implementation", dlerror(), ncclNetPluginName);
|
||||
// exit(-1);
|
||||
void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
|
||||
if (nullptr == handle) {
|
||||
if (ENOENT == errno) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: No plugin found (%s)", name);
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin load returned %d : %s when loading %s", errno, dlerror(), name);
|
||||
}
|
||||
}
|
||||
return handle;
|
||||
}
|
||||
|
||||
static void summarizeOpenNetPluginErrors(char* pluginNames) {
|
||||
const char *separator = " ";
|
||||
int len = strlen(pluginNames);
|
||||
// remove tail separator
|
||||
pluginNames[len - 1] = '\0';
|
||||
|
||||
// remove last plugin name
|
||||
while (len > 0 && pluginNames[--len] != *separator);
|
||||
if (len > 0) {
|
||||
pluginNames[len] = '\0';
|
||||
}
|
||||
|
||||
// distinguish between one load attempt and multiple attempts
|
||||
if (strstr(pluginNames, separator)) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Most recent plugin load returned %d : %s. All attempts to load '%s' also failed.", errno, dlerror(), pluginNames);
|
||||
} else {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Plugin load returned %d : %s : when loading %s", errno, dlerror(), pluginNames);
|
||||
}
|
||||
}
|
||||
|
||||
static void* openNetPluginLib(void) {
|
||||
void *pluginLib;
|
||||
|
||||
#define MAX_PLUGIN_LOAD 2
|
||||
|
||||
int len;
|
||||
char netPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
|
||||
char *ptr = netPluginLibNameTried;
|
||||
char netPluginLibName[PATH_MAX];
|
||||
const char *envNetPluginName = getenv("NCCL_NET_PLUGIN");
|
||||
if (envNetPluginName && strlen(envNetPluginName)) {
|
||||
snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName);
|
||||
pluginLib = tryOpenDynamicLib(netPluginLibName);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName);
|
||||
|
||||
snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName);
|
||||
pluginLib = tryOpenDynamicLib(netPluginLibName);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName);
|
||||
} else {
|
||||
snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so");
|
||||
pluginLib = tryOpenDynamicLib(netPluginLibName);
|
||||
if (pluginLib) {
|
||||
return pluginLib;
|
||||
}
|
||||
len = PATH_MAX - strlen(ptr);
|
||||
snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName);
|
||||
}
|
||||
summarizeOpenNetPluginErrors(ptr);
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetPluginInit() {
|
||||
void* netPluginLib = openNetPluginLib();
|
||||
if (netPluginLib == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin.");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+17
-10
@@ -358,9 +358,13 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
sub->channelId = op->channelId;
|
||||
sub->nsteps = op->nsteps;
|
||||
sub->nbytes = op->nbytes;
|
||||
sub->offset = 0;
|
||||
sub->peer = op->root;
|
||||
sub->reg = op->reg;
|
||||
sub->buffer = op->buffer;
|
||||
sub->sendMhandle = op->sendMhandle;
|
||||
sub->recvMhandle = op->recvMhandle;
|
||||
sub->sendbuff = op->sendbuff;
|
||||
sub->recvbuff = op->recvbuff;
|
||||
args->nsubs = subIndex+1;
|
||||
if (subIndex) {
|
||||
if ((args->sliceSteps != op->sliceSteps) ||
|
||||
@@ -634,7 +638,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op,
|
||||
if (ncclParamChunkSize() != 0) {
|
||||
info->chunkSize = ncclParamChunkSize();
|
||||
}
|
||||
op->buffer = op->reg ? info->recvbuff : NULL;
|
||||
op->recvbuff = op->reg ? (uint8_t*)info->recvbuff : NULL;
|
||||
op->chunkSize = info->chunkSize;
|
||||
op->nbytes = info->count;
|
||||
|
||||
@@ -820,7 +824,7 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
|
||||
if (createThreadContext) {
|
||||
if (proxyState->cudaCtx == NULL) {
|
||||
if (CUPFN(cuCtxCreate(&proxyState->cudaCtx,
|
||||
CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
|
||||
NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
|
||||
createThreadContext = 0;
|
||||
}
|
||||
@@ -1083,7 +1087,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
|
||||
ncclResult_t res = ncclSuccess;
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
void *opId = (void*)((((uintptr_t)random()) << 32) | random());
|
||||
void *opId;
|
||||
NCCLCHECK(getRandomData(&opId, sizeof(opId)));
|
||||
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
@@ -1365,6 +1370,12 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
|
||||
else if (op->type == ncclProxyMsgInit) {
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
|
||||
res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
|
||||
} else if (op->type == ncclProxyMsgRegister) {
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgRegister opId=%p op.reqBuff=%p, op->reqSize=%d, op->respSize=%d", op->opId, op->reqBuff, op->reqSize, op->respSize);
|
||||
res = op->connection->tcomm->proxyRegister(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
|
||||
} else if (op->type == ncclProxyMsgDeregister) {
|
||||
TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgDeregister opId=%p op.reqBuff=%p, op->reqSize=%d, op->respSize=%d", op->opId, op->reqBuff, op->reqSize, op->respSize);
|
||||
res = op->connection->tcomm->proxyDeregister(op->connection, proxyState, op->reqBuff, op->reqSize, &done);
|
||||
} else return ncclInternalError;
|
||||
|
||||
if (done) {
|
||||
@@ -1435,6 +1446,8 @@ static bool proxyMatchOpType(int type) {
|
||||
case ncclProxyMsgSetup:
|
||||
case ncclProxyMsgConnect:
|
||||
case ncclProxyMsgGetFd:
|
||||
case ncclProxyMsgRegister:
|
||||
case ncclProxyMsgDeregister:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -1663,12 +1676,6 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
|
||||
|
||||
// UDS support
|
||||
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag));
|
||||
// Seed the random number generator for UDS filename generation
|
||||
struct timeval time;
|
||||
gettimeofday(&time,NULL);
|
||||
unsigned int seed = time.tv_sec*time.tv_usec;
|
||||
seed ^= getpid();
|
||||
srandom(seed);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+6
-3
@@ -34,7 +34,7 @@ ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, str
|
||||
// Find local devices for p2p operations
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
int dev;
|
||||
if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, &dev) != ncclSuccess) goto end; // No local net
|
||||
if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, NULL, &dev) != ncclSuccess) goto end; // No local net
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
|
||||
if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
|
||||
@@ -152,7 +152,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
|
||||
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
|
||||
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
|
||||
NCCLCHECK(ncclRegister(comm, buff, size, handle));
|
||||
return ncclSuccess;
|
||||
@@ -160,7 +160,7 @@ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, vo
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
|
||||
NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
|
||||
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
|
||||
struct ncclReg* reg = (struct ncclReg*)handle;
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
int slot;
|
||||
@@ -175,6 +175,9 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
|
||||
NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
|
||||
reg->regAddr = (CUdeviceptr)NULL;
|
||||
}
|
||||
if (reg->state & COLLNET_REG_COMPLETE) {
|
||||
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
|
||||
}
|
||||
free(reg);
|
||||
memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
|
||||
cache->population -= 1;
|
||||
|
||||
+7
-19
@@ -229,7 +229,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
* others might still be trying to connect and import the buffer. No sync can lead to invalid
|
||||
* shmem/cuda buffer. In addition, we also clear all connect masks and free each connectInfo array */
|
||||
for (int i = 1; i < comm->nRanks; i++) {
|
||||
int bootstrapTag = (i << 8) + (graph ? graph->id + 1 : 0);
|
||||
int bootstrapTag = (i << 8) + (1 << 7) + (graph ? graph->id + 1 : 0);
|
||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
int flag = 0;
|
||||
@@ -271,27 +271,19 @@ extern struct ncclTransport collNetTransport;
|
||||
|
||||
// All ranks must participate in collNetSetup call
|
||||
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type) {
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
|
||||
int fail = 1;
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
int nMasters = comm->nNodes;
|
||||
int rankInCollNet = -1;
|
||||
int isMaster = (rank == masterRank) ? 1 : 0;
|
||||
struct {
|
||||
int collNetRank;
|
||||
ncclConnect connect;
|
||||
} sendrecvExchange;
|
||||
|
||||
// check if we can connect to collnet, whose root is the nranks-th rank
|
||||
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
|
||||
peerInfo->rank = nranks;
|
||||
|
||||
// send master receives connect info from peer recv master
|
||||
if (isMaster && type == collNetSend) {
|
||||
NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)));
|
||||
rankInCollNet = sendrecvExchange.collNetRank;
|
||||
TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
||||
TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, comm->node, nMasters, masterPeer);
|
||||
}
|
||||
|
||||
// select
|
||||
@@ -327,24 +319,20 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
c++;
|
||||
}
|
||||
}
|
||||
if (isMaster) rankInCollNet = comm->node;
|
||||
} else { // send side : copy in connect info received from peer recv master
|
||||
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
|
||||
if (isMaster) memcpy(masterConnects+comm->node, connect, sizeof(struct ncclConnect));
|
||||
}
|
||||
// connect
|
||||
if (isMaster) {
|
||||
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
||||
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup);
|
||||
struct ncclDevChannelPeer* devRoot;
|
||||
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
|
||||
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
|
||||
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
|
||||
}
|
||||
// recv side sends connect info to send side
|
||||
if (isMaster && type == collNetRecv) {
|
||||
sendrecvExchange.collNetRank = rankInCollNet;
|
||||
memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
|
||||
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
||||
memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect));
|
||||
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer);
|
||||
}
|
||||
fail = 0;
|
||||
cleanup:
|
||||
|
||||
+252
-34
@@ -9,6 +9,7 @@
|
||||
#include "graph.h"
|
||||
#include "proxy.h"
|
||||
#include "gdrwrap.h"
|
||||
#include "assert.h"
|
||||
|
||||
int64_t ncclParamGdrCopySyncEnable();
|
||||
int64_t ncclParamGdrCopyFlushEnable();
|
||||
@@ -151,8 +152,9 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
struct setupReq req = { 0 };
|
||||
|
||||
int proxyRank, tpProxyRank;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
|
||||
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
|
||||
@@ -171,8 +173,9 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
struct setupReq req = { 0 };
|
||||
|
||||
int proxyRank, tpProxyRank;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
|
||||
recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
// Determine whether we need to flush the GDR buffer on recv or not
|
||||
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
|
||||
@@ -696,8 +699,10 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
|
||||
__sync_synchronize();
|
||||
if (sub->reg == 0) {
|
||||
resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
|
||||
__sync_synchronize();
|
||||
}
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
|
||||
sub->posted += args->sliceSteps;
|
||||
@@ -708,8 +713,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
volatile uint64_t* recvTail = &resources->recvMem->tail;
|
||||
if (connFifo[buffSlot].size != -1 && ((*recvTail > (sub->base+sub->received)))) {
|
||||
if (args->coll != ncclFuncAllReduce) {
|
||||
if ((connFifo[buffSlot].size != -1 || sub->reg) && ((*recvTail > (sub->base+sub->received)))) {
|
||||
if (args->coll != ncclFuncAllReduce && sub->reg == 0) {
|
||||
int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0);
|
||||
int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1);
|
||||
if (sendEnd-sendBeg != connFifo[buffSlot].size) {
|
||||
@@ -740,33 +745,89 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
reqFifo[group][buffSlot].size = recvEnd - recvBeg;
|
||||
size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
|
||||
|
||||
if (sendBeg==sendEnd && recvBeg==recvEnd) {
|
||||
if (sendBeg==sendEnd && recvBeg==recvEnd && sub->reg == 0) {
|
||||
sub->requests[buffSlot] = nullptr; // trivally finished request
|
||||
} else {
|
||||
if (args->coll == ncclFuncAllReduce) {
|
||||
int count = (sendEnd-sendBeg)/eltSize;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region+sendBeg, region+recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
if (sub->reg) {
|
||||
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
|
||||
int count = (int)(nBytes / eltSize);
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff, sub->recvbuff, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, sub->requests + buffSlot));
|
||||
if (sub->requests[buffSlot]) {
|
||||
sub->nbytes -= nBytes;
|
||||
sub->sendbuff += nBytes;
|
||||
sub->recvbuff += nBytes;
|
||||
}
|
||||
} else {
|
||||
int count = (sendEnd - sendBeg) / eltSize;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests + buffSlot));
|
||||
}
|
||||
} else {
|
||||
sizePerRank = args->specifics.collnetDirect.sizePerRank;
|
||||
if (args->coll == ncclFuncAllGather) {
|
||||
ncclNetSGE_v8_t recvParts;
|
||||
recvParts.mhandle = recvMhandle;
|
||||
recvParts.address = region + recvBeg;
|
||||
recvParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallgather(
|
||||
resources->collNetComm, region+sendBeg, 1, &recvParts,
|
||||
sizePerRank, allBeg, allEnd-allBeg,
|
||||
sendMhandle, sub->requests+buffSlot));
|
||||
if (sub->reg) {
|
||||
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
|
||||
void *sendbuff;
|
||||
recvParts.mhandle = sub->recvMhandle;
|
||||
recvParts.address = sub->recvbuff;
|
||||
recvParts.size = nBytes;
|
||||
if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
|
||||
sendbuff = sub->sendbuff + sub->offset % sizePerRank;
|
||||
} else {
|
||||
sendbuff = sub->sendbuff;
|
||||
}
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallgather(
|
||||
resources->collNetComm, sendbuff, 1, &recvParts,
|
||||
sizePerRank, sub->offset, nBytes,
|
||||
sub->sendMhandle, sub->requests + buffSlot));
|
||||
if (sub->requests[buffSlot]) {
|
||||
sub->recvbuff += nBytes;
|
||||
sub->nbytes -= nBytes;
|
||||
sub->offset += nBytes;
|
||||
}
|
||||
} else {
|
||||
recvParts.mhandle = recvMhandle;
|
||||
recvParts.address = region + recvBeg;
|
||||
recvParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->iallgather(
|
||||
resources->collNetComm, region + sendBeg, 1, &recvParts,
|
||||
sizePerRank, allBeg, allEnd - allBeg,
|
||||
sendMhandle, sub->requests + buffSlot));
|
||||
}
|
||||
} else {
|
||||
ncclNetSGE_v8_t sendParts;
|
||||
sendParts.mhandle = sendMhandle;
|
||||
sendParts.address = region + sendBeg;
|
||||
sendParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
|
||||
resources->collNetComm, 1, &sendParts, region+recvBeg,
|
||||
sizePerRank, allBeg, allEnd-allBeg,
|
||||
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
|
||||
recvMhandle, sub->requests+buffSlot));
|
||||
if (sub->reg) {
|
||||
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
|
||||
void *recvbuff;
|
||||
sendParts.mhandle = sub->sendMhandle;
|
||||
sendParts.address = sub->sendbuff;
|
||||
sendParts.size = nBytes;
|
||||
if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
|
||||
recvbuff = sub->recvbuff + sub->offset % sizePerRank;
|
||||
} else {
|
||||
recvbuff = sub->recvbuff;
|
||||
}
|
||||
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
|
||||
resources->collNetComm, 1, &sendParts, recvbuff,
|
||||
sizePerRank, sub->offset, nBytes,
|
||||
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
|
||||
sub->recvMhandle, sub->requests + buffSlot));
|
||||
if (sub->requests[buffSlot]) {
|
||||
sub->sendbuff += nBytes;
|
||||
sub->nbytes -= nBytes;
|
||||
sub->offset += nBytes;
|
||||
}
|
||||
} else {
|
||||
sendParts.mhandle = sendMhandle;
|
||||
sendParts.address = region + sendBeg;
|
||||
sendParts.size = allEnd - allBeg;
|
||||
NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
|
||||
resources->collNetComm, 1, &sendParts, region + recvBeg,
|
||||
sizePerRank, allBeg, allEnd - allBeg,
|
||||
(ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
|
||||
recvMhandle, sub->requests + buffSlot));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (sub->requests[buffSlot] == nullptr) continue;
|
||||
@@ -854,7 +915,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
int totalSize = recvEnd - recvBeg;
|
||||
TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
|
||||
sub->received += args->sliceSteps;
|
||||
if (reqFifo[group][buffSlot].size > 0 && resources->useGdr && resources->needFlush) {
|
||||
if ((reqFifo[group][buffSlot].size > 0 || sub->reg) && resources->useGdr && resources->needFlush) {
|
||||
// GDRCOPY support
|
||||
if (resources->gdcFlush) {
|
||||
#if defined (__x86_64__)
|
||||
@@ -865,7 +926,37 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
return ncclInternalError;
|
||||
#endif
|
||||
} else {
|
||||
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
|
||||
if (sub->reg) {
|
||||
size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
|
||||
size_t offset = 0;
|
||||
if (args->coll == ncclFuncReduceScatter) {
|
||||
size_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
|
||||
int node = args->specifics.collnetDirect.node;
|
||||
int startNode = sub->offset / sizePerRank;
|
||||
int lastNode = (sub->offset + nBytes) / sizePerRank;
|
||||
if (startNode == node) {
|
||||
offset = sub->offset % sizePerRank;
|
||||
nBytes = std::min(sizePerRank - offset, nBytes);
|
||||
} else if (startNode < node && node < lastNode) {
|
||||
nBytes = sizePerRank;
|
||||
} else if (node == lastNode) {
|
||||
nBytes = (sub->offset + nBytes) % sizePerRank;
|
||||
} else {
|
||||
// no need to flush
|
||||
nBytes = 0;
|
||||
}
|
||||
}
|
||||
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset, nBytes, sub->recvMhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot]) {
|
||||
sub->nbytes -= nBytes;
|
||||
sub->offset += nBytes;
|
||||
if (args->coll == ncclFuncAllGather || args->coll == ncclFuncAllReduce) {
|
||||
sub->recvbuff += nBytes;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
|
||||
}
|
||||
}
|
||||
}
|
||||
args->idle = 0;
|
||||
@@ -886,10 +977,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
}
|
||||
}
|
||||
if (sub->transmitted < sub->flushed) {
|
||||
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
|
||||
__sync_synchronize();
|
||||
if (sub->reg == 0) {
|
||||
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
|
||||
__sync_synchronize();
|
||||
}
|
||||
volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
|
||||
*recvTail = sub->base + sub->flushed;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
@@ -916,9 +1009,134 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Request payload sent to the proxy with ncclProxyMsgRegister to register a
// buffer with CollNet. It is built with positional initializers ({addr, size}),
// so the field order must not change.
struct collnetRegInfo {
|
||||
// Start address of the region to register, stored as an integer; the proxy
// handlers cast it back to void* before passing it to regMr.
uintptr_t buffer;
|
||||
// Length of the region in bytes, passed to regMr as the size argument.
size_t size;
|
||||
};
|
||||
|
||||
/*
 * Make sure a user buffer that is already present in the communicator's
 * registration cache is also registered with CollNet.
 *
 * comm          communicator owning the registration cache (may be NULL).
 * userbuff      user buffer to look up (may be NULL).
 * buffSize      size of the user buffer in bytes (may be 0).
 * type          collNetRecv selects the recv proxy connector of channel 0's
 *               CollNet root peer (index comm->nRanks); any other value
 *               selects the send one. Also used as the connector index.
 * outRegBufFlag set to 0 when nothing is registered, 1 when a new CollNet
 *               registration was created, 2 when an existing one was reused.
 * outHandle     set to the CollNet handle, or NULL when outRegBufFlag is 0.
 *
 * Returns ncclSuccess unless ncclRegFind or the proxy call fails, in which
 * case the error is returned and both outputs are reset. A NULL handle coming
 * back from the proxy is not an error: the buffer is simply left unregistered.
 */
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) {
  // All locals are declared before the first goto so no jump crosses an initialization.
  ncclResult_t ret = ncclSuccess;
  struct ncclReg* regRecord = NULL;
  struct ncclProxyConnector* proxyconn = NULL;
  struct collnetRegInfo info;
  void* handle = NULL;

  *outRegBufFlag = 0;
  *outHandle = NULL;

  // Nothing to do without a communicator or a non-empty buffer.
  if (comm == NULL || userbuff == NULL || buffSize == 0) goto exit;

  NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
  // The buffer was never registered with this communicator.
  if (regRecord == NULL) goto exit;

  if (regRecord->state & COLLNET_REG_COMPLETE) {
    // reuse previous registration
    *outRegBufFlag = 2;
    *outHandle = regRecord->collnetHandle;
    goto exit;
  }

  /* start register collnet buffer: register the whole cached region, not just the user range */
  info.buffer = regRecord->addr;
  info.size = regRecord->pages * comm->regCache.pageSize;
  proxyconn = (type == collNetRecv)
      ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn
      : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;
  NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);

  if (handle) {
    // Remember the connector so the registration can be released through the same proxy.
    regRecord->state |= COLLNET_REG_COMPLETE;
    regRecord->proxyconn = proxyconn;
    *outHandle = regRecord->collnetHandle = handle;
    *outRegBufFlag = 1;
  }

exit:
  return ret;
fail:
  *outRegBufFlag = 0;
  *outHandle = NULL;
  goto exit;
}
|
||||
|
||||
/*
 * Register a user buffer with CollNet on behalf of a kernel plan and queue
 * the resulting handle on the plan's collnetHandleQueue.
 *
 * comm          communicator; its regCache.pageSize is used for alignment.
 * plan          kernel plan that receives the registration record.
 * userbuff      user buffer to register.
 * buffSize      size of the user buffer in bytes.
 * type          collNetRecv selects the recv proxy connector of channel 0's
 *               CollNet root peer (index comm->nRanks); any other value
 *               selects the send one. Also used as the connector index.
 * outRegBufFlag set to 1 when the buffer was registered, 0 otherwise.
 * outHandle     set to the CollNet handle, or NULL when not registered.
 *
 * Returns ncclSuccess unless the proxy call itself fails. A NULL handle from
 * the proxy (which is how the proxy-side handlers report a failed regMr) is
 * not an error: nothing is recorded and outRegBufFlag stays 0, matching
 * ncclCollnetLocalRegisterBuffer.
 */
ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) {
  ncclResult_t ret = ncclSuccess;
  void* handle = NULL;
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t pageSize = cache->pageSize;
  // Round the start down to a page boundary and the length up to whole pages.
  // NOTE(review): the mask with -pageSize assumes pageSize is a power of two - confirm.
  uintptr_t addr = (uintptr_t)userbuff & -pageSize;
  size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize;
  collnetRegInfo info = {addr, size};
  struct ncclCollnetHandleList* record = NULL;
  struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;

  *outRegBufFlag = 0;
  *outHandle = NULL;
  NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);

  // Registration was refused: do not record a NULL handle or report success.
  if (handle == NULL) goto exit;

  record = ncclMemoryPoolAlloc<struct ncclCollnetHandleList>(&comm->memPool_ncclCollnetHandleList, &comm->memPermanent);
  record->proxyconn = proxyConn;
  record->buffer = userbuff;
  record->size = buffSize;
  *outHandle = record->collnetHandle = handle;
  *outRegBufFlag = 1;
  ncclIntruQueueEnqueue(&plan->collnetHandleQueue, record);

exit:
  return ret;
fail:
  *outRegBufFlag = 0;
  *outHandle = NULL;
  goto exit;
}
|
||||
|
||||
// Ask the proxy to release a CollNet registration.
// comm      : communicator used for the proxy call.
// proxyconn : proxy connector the request is sent on.
// handle    : CollNet handle to release.
// Returns ncclSuccess, or the error propagated by NCCLCHECK when the blocking
// proxy call fails.
ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle) {
|
||||
// The request payload is the handle value itself (sizeof(void*) bytes read from
// &handle); no response buffer is requested (NULL, 0).
NCCLCHECK(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/*
 * Proxy-side handler for a buffer registration request on a send connection.
 *
 * reqBuff holds a struct collnetRegInfo; respBuff receives one void* that is
 * the CollNet memory handle, or NULL when regMr did not succeed. The handler
 * always completes in one call (*done = 1) and always returns ncclSuccess:
 * a registration failure is reported through the NULL handle only.
 */
static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  assert(reqSize == sizeof(struct collnetRegInfo));
  assert(respSize == sizeof(void*));

  struct sendResources* sendRes = (struct sendResources*)connection->transportResources;
  struct collnetRegInfo* request = (struct collnetRegInfo*)reqBuff;
  void* mrHandle;

  if (proxyState->ncclCollNet->regMr(sendRes->collNetComm, (void*)request->buffer, request->size, NCCL_PTR_CUDA, &mrHandle) != ncclSuccess) {
    // Swallow the error on purpose; the caller sees a NULL handle instead.
    mrHandle = NULL;
  }

  memcpy(respBuff, &mrHandle, sizeof(void*));
  *done = 1;
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Proxy-side handler for a buffer registration request on a recv connection.
 *
 * reqBuff holds a struct collnetRegInfo; respBuff receives one void* that is
 * the CollNet memory handle, or NULL when regMr did not succeed. The handler
 * always completes in one call (*done = 1) and always returns ncclSuccess:
 * a registration failure is reported through the NULL handle only.
 */
static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  assert(reqSize == sizeof(struct collnetRegInfo));
  assert(respSize == sizeof(void*));

  struct recvResources* recvRes = (struct recvResources*)connection->transportResources;
  struct collnetRegInfo* request = (struct collnetRegInfo*)reqBuff;
  void* mrHandle;

  if (proxyState->ncclCollNet->regMr(recvRes->collNetComm, (void*)request->buffer, request->size, NCCL_PTR_CUDA, &mrHandle) != ncclSuccess) {
    // Swallow the error on purpose; the caller sees a NULL handle instead.
    mrHandle = NULL;
  }

  memcpy(respBuff, &mrHandle, sizeof(void*));
  *done = 1;
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Proxy-side handler for a buffer deregistration request on a send connection.
 *
 * reqBuff carries exactly one void*: the CollNet memory handle to release.
 * On success *done is set to 1 and ncclSuccess is returned; a deregMr failure
 * is propagated by NCCLCHECK and *done is left untouched.
 */
static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
  assert(reqSize == sizeof(void*));

  struct sendResources* sendRes = (struct sendResources*)connection->transportResources;
  void* mrHandle;
  // The handle travels by value inside the request buffer.
  memcpy(&mrHandle, reqBuff, sizeof(void*));

  NCCLCHECK(proxyState->ncclCollNet->deregMr(sendRes->collNetComm, mrHandle));
  *done = 1;
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Proxy-side handler for a buffer deregistration request on a recv connection.
 *
 * reqBuff carries exactly one void*: the CollNet memory handle to release.
 * On success *done is set to 1 and ncclSuccess is returned; a deregMr failure
 * is propagated by NCCLCHECK and *done is left untouched.
 */
static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
  assert(reqSize == sizeof(void*));

  struct recvResources* recvRes = (struct recvResources*)connection->transportResources;
  void* mrHandle;
  // The handle travels by value inside the request buffer.
  memcpy(&mrHandle, reqBuff, sizeof(void*));

  NCCLCHECK(proxyState->ncclCollNet->deregMr(recvRes->collNetComm, mrHandle));
  *done = 1;
  return ncclSuccess;
}
|
||||
|
||||
struct ncclTransport collNetTransport = {
|
||||
"COL",
|
||||
canConnect,
|
||||
{ sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
|
||||
{ recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
|
||||
{ sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer },
|
||||
{ recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer }
|
||||
};
|
||||
|
||||
+17
-13
@@ -179,8 +179,9 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
req.connIndex = connIndex;
|
||||
|
||||
int proxyRank;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
|
||||
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
tpProxyRank = comm->topParentRanks[proxyRank];
|
||||
@@ -216,8 +217,9 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
|
||||
// Use myInfo->rank as the receiver uses its own NIC
|
||||
int proxyRank, tpProxyRank;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
|
||||
|
||||
// Determine whether we need to flush the GDR buffer on recv or not
|
||||
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
|
||||
@@ -347,6 +349,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
send->conn.tail = &recvMem->tail;
|
||||
send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
send->conn.connFifo = recvMem->connFifo;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
@@ -412,6 +415,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
|
||||
void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
|
||||
recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
|
||||
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
recv->conn.connFifo = recvMem->connFifo;
|
||||
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
@@ -1035,7 +1039,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
sub->posted = sub->transmitted = sub->done = 0;
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
} else {
|
||||
sub->mhandle = resources->mhandles[args->protocol];
|
||||
}
|
||||
@@ -1110,7 +1114,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
|
||||
}
|
||||
} else if (p == NCCL_PROTO_SIMPLE && resources->shared) {
|
||||
buff = sub->reg ? (char*)sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset;
|
||||
buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset;
|
||||
}
|
||||
if (ready) {
|
||||
// Data is ready, try to send.
|
||||
@@ -1134,7 +1138,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (done) {
|
||||
if (sub->reg) {
|
||||
if (size < sub->nbytes) {
|
||||
sub->buffer = ((char*)sub->buffer)+size;
|
||||
sub->recvbuff += size;
|
||||
sub->nbytes -= size;
|
||||
// Do one more step (at least)
|
||||
sub->nsteps++;
|
||||
@@ -1215,7 +1219,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
// Register buffer
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
} else {
|
||||
sub->mhandle = resources->mhandles[args->protocol];
|
||||
}
|
||||
@@ -1247,7 +1251,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (sub->reg) {
|
||||
// Wait until CUDA kernel has started before we access the user buffer directly.
|
||||
if (connFifo[sub->base%NCCL_STEPS].size == -1) continue;
|
||||
ptrs[subCount] = sub->buffer;
|
||||
ptrs[subCount] = sub->recvbuff;
|
||||
sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes);
|
||||
} else {
|
||||
int sharedBuffSlot = sub->posted%maxDepth;
|
||||
@@ -1307,7 +1311,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
int size = sizes[subIndex++];
|
||||
if (sub->reg) {
|
||||
if (size < sub->nbytes) {
|
||||
sub->buffer = ((char*)sub->buffer) + size;
|
||||
sub->recvbuff += size;
|
||||
sub->nbytes -= size;
|
||||
// Do one more step (at least)
|
||||
sub->nsteps++;
|
||||
@@ -1349,7 +1353,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS;
|
||||
ptrs[subCount] = resources->shared ?
|
||||
(sub->reg ? sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
|
||||
(sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
|
||||
localBuff+buffSlot*stepSize;
|
||||
mhandles[subCount] = sub->mhandle;
|
||||
subCount++;
|
||||
@@ -1439,6 +1443,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct ncclTransport netTransport = {
|
||||
"NET",
|
||||
canConnect,
|
||||
{ sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
|
||||
{ recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
|
||||
{ sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, NULL },
|
||||
{ recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, NULL }
|
||||
};
|
||||
|
||||
+233
-20
@@ -77,7 +77,8 @@ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
|
||||
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int ncclIbRelaxedOrderingEnabled = 0;
|
||||
|
||||
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
|
||||
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1);
|
||||
NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2);
|
||||
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18);
|
||||
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
|
||||
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
|
||||
@@ -103,6 +104,210 @@ static void* ncclIbAsyncThreadMain(void* args) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static sa_family_t envIbAddrFamily(void) {
|
||||
sa_family_t family = AF_INET;
|
||||
const char* env = ncclGetEnv("NCCL_IB_ADDR_FAMILY");
|
||||
if (env == NULL || strlen(env) == 0) {
|
||||
return family;
|
||||
}
|
||||
|
||||
INFO(NCCL_ENV, "NCCL_IB_ADDR_FAMILY set by environment to %s", env);
|
||||
|
||||
if (strcmp(env, "AF_INET") == 0) {
|
||||
family = AF_INET;
|
||||
} else if (strcmp(env, "AF_INET6") == 0) {
|
||||
family = AF_INET6;
|
||||
}
|
||||
|
||||
return family;
|
||||
}
|
||||
|
||||
static void* envIbAddrRange(sa_family_t af, int* mask) {
|
||||
*mask = 0;
|
||||
static struct in_addr addr;
|
||||
static struct in6_addr addr6;
|
||||
void *ret = (af == AF_INET) ? (void *)&addr : (void *)&addr6;
|
||||
|
||||
const char* env = ncclGetEnv("NCCL_IB_ADDR_RANGE");
|
||||
if (NULL == env || strlen(env) == 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
INFO(NCCL_ENV, "NCCL_IB_ADDR_RANGE set by environment to %s", env);
|
||||
|
||||
char addrString[128] = { 0 };
|
||||
snprintf(addrString, 128, "%s", env);
|
||||
char *addrStrPtr = addrString;
|
||||
char *maskStrPtr = strstr(addrString, "/") + 1;
|
||||
if (NULL == maskStrPtr) {
|
||||
return NULL;
|
||||
}
|
||||
*(maskStrPtr - 1) = '\0';
|
||||
|
||||
if (inet_pton(af, addrStrPtr, ret) == 0) {
|
||||
WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*mask = (int)strtol(maskStrPtr, NULL, 10);
|
||||
if (af == AF_INET && *mask > 32) {
|
||||
WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6");
|
||||
*mask = 0;
|
||||
ret = NULL;
|
||||
} else if (af == AF_INET6 && *mask > 128) {
|
||||
WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6");
|
||||
*mask = 0;
|
||||
ret = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static sa_family_t getGidAddrFamily(union ibv_gid* gid) {
|
||||
const struct in6_addr *a = (struct in6_addr *)gid->raw;
|
||||
bool isIpV4Mapped = ((a->s6_addr32[0] | a->s6_addr32[1]) | (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL;
|
||||
bool isIpV4MappedMulticast = (a->s6_addr32[0] == htonl(0xff0e0000) && ((a->s6_addr32[1] | (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL));
|
||||
return (isIpV4Mapped || isIpV4MappedMulticast) ? AF_INET : AF_INET6;
|
||||
}
|
||||
|
||||
/*
 * Check whether a GID falls inside the address range prefix/prefixlen.
 *
 * af:        AF_INET compares `prefix` (struct in_addr) against the last
 *            32-bit word of the GID; any other value treats `prefix` as a
 *            struct in6_addr compared against the whole GID.
 * prefix:    range base address; not read when prefixlen is 0.
 * prefixlen: number of leading bits that must match.
 *
 * Returns true when the first `prefixlen` bits agree. A prefixlen of 0 always
 * matches; a negative prefixlen, a prefixlen longer than the address, or a
 * NULL prefix with a non-zero prefixlen never matches.
 */
static bool matchGidAddrPrefix(sa_family_t af, void* prefix, int prefixlen, union ibv_gid* gid) {
  if (prefixlen <= 0) {
    return prefixlen == 0;
  }
  if (prefix == NULL) {
    return false;
  }

  const struct in6_addr *addr6 = (const struct in6_addr *)gid->raw;

  if (af == AF_INET) {
    if (prefixlen > 32) {
      return false;
    }
    const struct in_addr *base = (const struct in_addr *)prefix;
    // Unsigned shift by 0..31 bits: well defined, unlike shifting a signed 1.
    const uint32_t mask = htonl(0xffffffffu << (32 - prefixlen));
    return ((base->s_addr ^ addr6->s6_addr32[3]) & mask) == 0;
  }

  const struct in6_addr *base6 = (const struct in6_addr *)prefix;
  int remaining = prefixlen;
  for (int i = 0; i < 4 && remaining > 0; ++i) {
    // Whole words are compared in full; the last partial word is masked in
    // network byte order, matching how s6_addr32[] is stored.
    const bool wholeWord = (remaining >= 32);
    const uint32_t mask = wholeWord ? 0xffffffffu : htonl(0xffffffffu << (32 - remaining));
    if ((base6->s6_addr32[i] ^ addr6->s6_addr32[i]) & mask) {
      return false;
    }
    remaining = wholeWord ? remaining - 32 : 0;
  }

  // Bits left over after four words mean the prefix was longer than 128.
  return remaining == 0;
}
|
||||
|
||||
static bool configuredGid(union ibv_gid* gid) {
|
||||
const struct in6_addr *a = (struct in6_addr *)gid->raw;
|
||||
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
|
||||
if (((a->s6_addr32[0] | trailer) == 0UL) || ((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool linkLocalGid(union ibv_gid* gid) {
|
||||
const struct in6_addr *a = (struct in6_addr *)gid->raw;
|
||||
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool validGid(union ibv_gid* gid) {
|
||||
return (configuredGid(gid) && !linkLocalGid(gid));
|
||||
}
|
||||
|
||||
/*
 * Read the RoCE version of one GID table entry from sysfs
 * (/sys/class/infiniband/<dev>/ports/<port>/gid_attrs/types/<index>).
 *
 * deviceName: IB device name used in the sysfs path.
 * portNum:    port number used in the sysfs path.
 * gidIndex:   GID table index used in the sysfs path.
 * version:    out; set to 1 for "IB/RoCE v1" / "RoCE v1", 2 for "RoCE v2",
 *             and 0 when the type string is empty or not recognized. Written
 *             only when ncclSuccess is returned.
 *
 * Returns ncclSystemError if the path does not fit the buffer or the file
 * cannot be opened or read, ncclSuccess otherwise.
 */
static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, int gidIndex, int* version) {
  char gidRoceVerStr[16] = { 0 };
  char roceTypePath[PATH_MAX] = { 0 };
  // Bounded formatting: the device name comes from outside this function.
  int pathLen = snprintf(roceTypePath, sizeof(roceTypePath), "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex);
  if (pathLen < 0 || (size_t)pathLen >= sizeof(roceTypePath)) {
    return ncclSystemError;
  }

  int fd = open(roceTypePath, O_RDONLY);
  if (fd == -1) {
    return ncclSystemError;
  }
  // Leave the last byte untouched so the buffer stays NUL-terminated.
  ssize_t ret = read(fd, gidRoceVerStr, sizeof(gidRoceVerStr) - 1);
  close(fd);

  if (ret == -1) {
    return ncclSystemError;
  }

  // Default to 0 so the caller never reads an indeterminate value when the
  // type string is empty or unknown.
  int roceVer = 0;
  if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0 || strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
    roceVer = 1;
  } else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
    roceVer = 2;
  }
  *version = roceVer;

  return ncclSuccess;
}
|
||||
|
||||
/*
 * Decide whether GID table entry `gidIndexCandidate` should replace the
 * currently selected entry `*gidIndex`.
 *
 * context, portNum: device context and port whose GID table is queried.
 * af:               address family requested by the user.
 * prefix/prefixlen: address range the candidate must fall in (prefixlen 0
 *                   accepts any address).
 * roceVer:          RoCE version requested by the user.
 * gidIndex:         in/out; current selection, overwritten with the candidate
 *                   when the candidate is preferred.
 *
 * Selection rules, in order:
 *  1. A candidate whose family differs from the current entry's, equals the
 *     requested family and matches the range is taken immediately.
 *  2. Otherwise a candidate of the wrong family, an invalid candidate, or one
 *     outside the range is ignored.
 *  3. Otherwise the candidate is taken when its RoCE version equals the
 *     requested one and either the two versions differ or the current GID is
 *     not valid.
 *
 * Returns the first error from the GID queries or the sysfs version lookups,
 * ncclSuccess otherwise.
 */
static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t portNum, sa_family_t af, void* prefix, int prefixlen, int roceVer, int gidIndexCandidate, int* gidIndex) {
  union ibv_gid gid, gidCandidate;
  NCCLCHECK(wrap_ibv_query_gid(context, portNum, *gidIndex, &gid));
  NCCLCHECK(wrap_ibv_query_gid(context, portNum, gidIndexCandidate, &gidCandidate));

  sa_family_t usrFam = af;
  sa_family_t gidFam = getGidAddrFamily(&gid);
  sa_family_t gidCandidateFam = getGidAddrFamily(&gidCandidate);
  bool gidCandidateMatchSubnet = matchGidAddrPrefix(usrFam, prefix, prefixlen, &gidCandidate);

  if (gidCandidateFam != gidFam && gidCandidateFam == usrFam && gidCandidateMatchSubnet) {
    *gidIndex = gidIndexCandidate;
  } else {
    if (gidCandidateFam != usrFam || !validGid(&gidCandidate) || !gidCandidateMatchSubnet) {
      return ncclSuccess;
    }
    int usrRoceVer = roceVer;
    // Start from 0 ("unknown"): the lookup may succeed without recognizing
    // the sysfs type string, and the comparison below must not read
    // indeterminate values.
    int gidRoceVerNum = 0, gidRoceVerNumCandidate = 0;
    const char* deviceName = wrap_ibv_get_device_name(context->device);
    NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum));
    NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate));
    if ((gidRoceVerNum != gidRoceVerNumCandidate || !validGid(&gid)) && gidRoceVerNumCandidate == usrRoceVer) {
      *gidIndex = gidIndexCandidate;
    }
  }

  return ncclSuccess;
}
|
||||
|
||||
/*
 * Pick the GID table index to use for a port.
 *
 * context, portNum: device context and port whose GID table is scanned.
 * gidTblLen:        number of entries in that table.
 * gidIndex:         out; the chosen index.
 *
 * When the IB_GID_INDEX parameter is non-negative it is used as is. Otherwise
 * selection starts at index 0 and every following entry is offered to
 * ncclUpdateGidIndex() together with the requested address family, address
 * range and RoCE version.
 *
 * Returns the first error reported while scanning, ncclSuccess otherwise.
 */
static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portNum, int gidTblLen, int *gidIndex) {
  *gidIndex = ncclParamIbGidIndex();
  if (*gidIndex < 0) {
    // No explicit index: gather the user's preferences, then scan the table.
    sa_family_t wantedFamily = envIbAddrFamily();
    int wantedRoceVersion = ncclParamIbRoceVersionNum();
    int rangeBits;
    void *rangeBase = envIbAddrRange(wantedFamily, &rangeBits);

    *gidIndex = 0;
    int candidate = 1;
    while (candidate < gidTblLen) {
      NCCLCHECK(ncclUpdateGidIndex(context, portNum, wantedFamily, rangeBase, rangeBits, wantedRoceVersion, candidate, gidIndex));
      candidate++;
    }
  }

  return ncclSuccess;
}
|
||||
|
||||
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
|
||||
NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1);
|
||||
NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1);
|
||||
@@ -182,6 +387,7 @@ int ncclIbFindMatchingDev(int dev) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
ncclResult_t ret;
|
||||
if (ncclParamIbDisable()) return ncclInternalError;
|
||||
static int shownIbHcaEnv = 0;
|
||||
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
|
||||
@@ -194,7 +400,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
ncclNMergedIbDevs = 0;
|
||||
if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
|
||||
WARN("NET/IB : No IP interface found.");
|
||||
return ncclInternalError;
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
// Detect IB cards
|
||||
@@ -211,7 +418,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
if (searchExact) userIbEnv++;
|
||||
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
|
||||
|
||||
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
|
||||
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; }
|
||||
|
||||
for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
|
||||
struct ibv_context * context;
|
||||
@@ -224,7 +431,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
memset(&devAttr, 0, sizeof(devAttr));
|
||||
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
|
||||
WARN("NET/IB : Unable to query device %s", devices[d]->name);
|
||||
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
|
||||
if (ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
|
||||
continue;
|
||||
}
|
||||
for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) {
|
||||
@@ -244,6 +451,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
|
||||
ncclIbDevs[ncclNIbDevs].device = d;
|
||||
ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
|
||||
ncclIbDevs[ncclNIbDevs].portAttr = portAttr;
|
||||
ncclIbDevs[ncclNIbDevs].portNum = port_num;
|
||||
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
|
||||
ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
|
||||
@@ -295,9 +503,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
ncclNIbDevs++;
|
||||
nPorts++;
|
||||
}
|
||||
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
|
||||
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
|
||||
}
|
||||
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
|
||||
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; };
|
||||
}
|
||||
if (ncclNIbDevs == 0) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
|
||||
@@ -333,6 +541,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
pthread_mutex_unlock(&ncclIbLock);
|
||||
}
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
pthread_mutex_unlock(&ncclIbLock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbDevices(int* ndev) {
|
||||
@@ -484,6 +695,7 @@ struct ncclIbHandle {
|
||||
struct ncclIbGidInfo {
|
||||
uint8_t link_layer;
|
||||
union ibv_gid localGid;
|
||||
int32_t localGidIndex;
|
||||
};
|
||||
|
||||
#define NCCL_NET_IB_REQ_UNUSED 0
|
||||
@@ -516,7 +728,7 @@ struct ncclIbNetCommDevBase {
|
||||
int ibDevN;
|
||||
struct ibv_pd* pd;
|
||||
struct ibv_cq* cq;
|
||||
uint64_t pad[1];
|
||||
uint64_t pad[2];
|
||||
struct ncclIbGidInfo gidInfo;
|
||||
};
|
||||
|
||||
@@ -698,7 +910,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t dest_qp_num, struct ncclIbDevInfo* info) {
|
||||
ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint8_t sGidIndex, uint32_t dest_qp_num, struct ncclIbDevInfo* info) {
|
||||
struct ibv_qp_attr qpAttr;
|
||||
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
|
||||
qpAttr.qp_state = IBV_QPS_RTR;
|
||||
@@ -712,7 +924,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t dest_qp_num, struct ncclIbD
|
||||
qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
|
||||
qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid;
|
||||
qpAttr.ah_attr.grh.flow_label = 0;
|
||||
qpAttr.ah_attr.grh.sgid_index = ncclParamIbGidIndex();
|
||||
qpAttr.ah_attr.grh.sgid_index = sGidIndex;
|
||||
qpAttr.ah_attr.grh.hop_limit = 255;
|
||||
qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc();
|
||||
} else {
|
||||
@@ -818,9 +1030,6 @@ ib_connect_check:
|
||||
for (int i = 0; i < comm->base.ndevs; i++) {
|
||||
ncclIbSendCommDev* commDev = comm->devs + i;
|
||||
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
|
||||
// Send my QP Info to receiver through the socket. Hope this won't block.
|
||||
// TODO - I thought I queried this in init?
|
||||
NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));
|
||||
|
||||
// Write to the metadata struct via this pointer
|
||||
ncclIbDevInfo* devInfo = meta.devs + i;
|
||||
@@ -835,7 +1044,8 @@ ib_connect_check:
|
||||
// RoCE support
|
||||
devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
|
||||
if (devInfo->link_layer == IBV_LINK_LAYER_ETHERNET) {
|
||||
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &commDev->base.gidInfo.localGid));
|
||||
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &commDev->base.gidInfo.localGidIndex));
|
||||
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid));
|
||||
devInfo->spn = commDev->base.gidInfo.localGid.global.subnet_prefix;
|
||||
devInfo->iid = commDev->base.gidInfo.localGid.global.interface_id;
|
||||
}
|
||||
@@ -854,7 +1064,7 @@ ib_connect_check:
|
||||
if (comm->base.qps[q].devIndex == i)
|
||||
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
|
||||
comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
|
||||
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(),
|
||||
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex,
|
||||
devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey);
|
||||
}
|
||||
}
|
||||
@@ -923,12 +1133,15 @@ ib_connect:
|
||||
|
||||
// Assign per-QP remDev
|
||||
comm->base.qps[q].remDevIdx = remQpInfo->devIndex;
|
||||
int devIndex = comm->base.qps[q].devIndex;
|
||||
ncclIbSendCommDev* commDev = comm->devs + devIndex;
|
||||
uint8_t gidIndex = commDev->base.gidInfo.localGidIndex;
|
||||
|
||||
struct ibv_qp* qp = comm->base.qps[q].qp;
|
||||
if (remQpInfo->ece_supported && remQpInfo->ece_supported)
|
||||
NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported));
|
||||
|
||||
NCCLCHECK(ncclIbRtrQp(qp, remQpInfo->qpn, remDevInfo));
|
||||
NCCLCHECK(ncclIbRtrQp(qp, gidIndex, remQpInfo->qpn, remDevInfo));
|
||||
NCCLCHECK(ncclIbRtsQp(qp));
|
||||
}
|
||||
|
||||
@@ -1024,8 +1237,8 @@ ib_recv:
|
||||
ibDevN = mergedDev->devs[i];
|
||||
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
|
||||
ibDev = ncclIbDevs + ibDevN;
|
||||
NCCLCHECK(wrap_ibv_query_port(ibDev->context, ibDev->portNum, &ibDev->portAttr));
|
||||
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, ncclParamIbGidIndex(), &rCommDev->base.gidInfo.localGid));
|
||||
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &rCommDev->base.gidInfo.localGidIndex));
|
||||
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid));
|
||||
}
|
||||
|
||||
// Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
|
||||
@@ -1064,7 +1277,7 @@ ib_recv:
|
||||
NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclIbRtrQp(qp->qp, remMeta.qpInfo[q].qpn, remDevInfo));
|
||||
NCCLCHECK(ncclIbRtrQp(qp->qp, rCommDev->base.gidInfo.localGidIndex, remMeta.qpInfo[q].qpn, remDevInfo));
|
||||
NCCLCHECK(ncclIbRtsQp(qp->qp));
|
||||
}
|
||||
|
||||
@@ -1097,7 +1310,7 @@ ib_recv:
|
||||
devInfo.spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
|
||||
devInfo.iid = rCommDev->base.gidInfo.localGid.global.interface_id;
|
||||
devInfo.mtu = ibDev->portAttr.active_mtu;
|
||||
NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo));
|
||||
NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->base.gidInfo.localGidIndex, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo));
|
||||
NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp));
|
||||
}
|
||||
|
||||
@@ -1724,7 +1937,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (req->nreqs == 1) {
|
||||
req->recv.sizes[0] += wc->imm_data;
|
||||
req->recv.sizes[0] = wc->imm_data;
|
||||
}
|
||||
}
|
||||
req->events[i]--;
|
||||
|
||||
+31
-16
@@ -46,12 +46,12 @@ struct ncclTransport nvlsTransport = {
|
||||
{ NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) {
|
||||
ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, size_t size) {
|
||||
CUmulticastObjectProp* prop = &resources->properties;
|
||||
memset(prop, 0, sizeof(*prop));
|
||||
prop->size = size;
|
||||
prop->numDevices = nranks;
|
||||
prop->handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
|
||||
prop->numDevices = comm->MNNVL ? comm->clique.size : comm->localRanks;
|
||||
prop->handleTypes = ncclCuMemHandleType;
|
||||
prop->flags = 0;
|
||||
|
||||
// Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved
|
||||
@@ -70,6 +70,7 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes*
|
||||
}
|
||||
|
||||
ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) {
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
size_t size = prop->size;
|
||||
|
||||
// Create a Multicast group
|
||||
@@ -77,9 +78,9 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop,
|
||||
INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank);
|
||||
CUCHECK(cuMulticastCreate(mcHandle, prop));
|
||||
|
||||
if ((NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) && (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)) {
|
||||
if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
|
||||
// Get a handle to pass to other ranks
|
||||
CUCHECK(cuMemExportToShareableHandle(shareableHandle, *mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
|
||||
CUCHECK(cuMemExportToShareableHandle(shareableHandle, *mcHandle, ncclCuMemHandleType, 0));
|
||||
}
|
||||
else {
|
||||
memcpy(shareableHandle, mcHandle, sizeof(CUmemGenericAllocationHandle));
|
||||
@@ -97,7 +98,7 @@ ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes*
|
||||
}
|
||||
|
||||
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) {
|
||||
CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
|
||||
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
|
||||
|
||||
@@ -113,7 +114,7 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int
|
||||
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
|
||||
(void) close(fd);
|
||||
} else {
|
||||
if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
|
||||
if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
|
||||
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type));
|
||||
} else {
|
||||
memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle));
|
||||
@@ -136,7 +137,7 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* r
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.location.id = resources->dev;
|
||||
prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
|
||||
prop.requestedHandleTypes = ncclCuMemHandleType;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||
resources->ucGran = granularity;
|
||||
|
||||
@@ -229,6 +230,7 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes*
|
||||
|
||||
NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2);
|
||||
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16);
|
||||
NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024);
|
||||
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
|
||||
comm->nvlsSupport = 0;
|
||||
@@ -236,8 +238,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
|
||||
|
||||
int gpuCount;
|
||||
NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
|
||||
// NVLS is not supported on MNNVL yet
|
||||
if (!ncclParamNvlsEnable() || gpuCount <= 2 || comm->nNodes > 1 || comm->MNNVL) return ncclSuccess;
|
||||
if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess;
|
||||
|
||||
CUdevice dev;
|
||||
int driverVersion;
|
||||
@@ -306,7 +307,8 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
NCCLCHECK(initNvlsChannel(comm, c, parent, false));
|
||||
}
|
||||
|
||||
size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
|
||||
int nvlsStepSize = comm->nvlsChunkSize = ncclParamNvlsChunkSize();
|
||||
size_t buffSize = nvlsStepSize * NCCL_STEPS;
|
||||
size_t memSize = NVLS_MEM_ALIGN_SIZE;
|
||||
size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize);
|
||||
size_t nvlsTotalSize = nvlsPerRankSize * nHeads;
|
||||
@@ -315,7 +317,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize);
|
||||
|
||||
char* shareableHandle = resources->shareableHandle;
|
||||
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup);
|
||||
NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nvlsTotalSize), res, cleanup);
|
||||
if (comm->localRank == 0) {
|
||||
NCCLCHECKGOTO(nvlsGroupCreate(comm, &resources->properties, comm->localRank, comm->localRanks, &resources->mcHandle, shareableHandle), res, cleanup);
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
|
||||
@@ -326,8 +328,14 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
|
||||
NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
|
||||
NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup);
|
||||
// Local intra-node barrier to ensure everyone has bound their memory to the group
|
||||
NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
|
||||
if (comm->localRanks > 1) {
|
||||
// Local intra-node barrier to ensure everyone has bound their memory to the group
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup);
|
||||
}
|
||||
if (comm->MNNVL) {
|
||||
// MNNVL: Clique wide barrier to ensure everyone has bound their memory to the group
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, comm->clique.ranks[0]), res, cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup);
|
||||
|
||||
for (int h = 0; h < nHeads; h++) {
|
||||
@@ -343,11 +351,13 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
|
||||
peer->send[1].conn.head = (uint64_t*)(mem + buffSize);
|
||||
peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
|
||||
peer->send[1].conn.stepSize = nvlsStepSize;
|
||||
mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize);
|
||||
peer->recv[0].transportComm = &nvlsTransport.recv;
|
||||
peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
|
||||
peer->recv[0].conn.head = (uint64_t*)(mem + buffSize);
|
||||
peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
|
||||
peer->recv[0].conn.stepSize = nvlsStepSize;
|
||||
peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL;
|
||||
|
||||
// Broadcast MC -> UC
|
||||
@@ -356,11 +366,13 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
|
||||
peer->recv[1].conn.head = (uint64_t*)(mem + buffSize);
|
||||
peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
|
||||
peer->recv[1].conn.stepSize = nvlsStepSize;
|
||||
mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize);
|
||||
peer->send[0].transportComm = &nvlsTransport.send;
|
||||
peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem;
|
||||
peer->send[0].conn.head = (uint64_t*)(mem + buffSize);
|
||||
peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
|
||||
peer->send[0].conn.stepSize = nvlsStepSize;
|
||||
peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;
|
||||
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
|
||||
@@ -378,6 +390,9 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
|
||||
}
|
||||
}
|
||||
|
||||
// MNNVL does not support NVLS buffer registration
|
||||
if (comm->MNNVL) return res;
|
||||
|
||||
/* create shared memory for fast NVLS buffer registration */
|
||||
typeSize = sizeof(struct localRegData) << 1;
|
||||
|
||||
@@ -595,7 +610,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
|
||||
if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
|
||||
localRegBufUsed = true;
|
||||
INFO(NCCL_NVLS, "rank %d reuse local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
|
||||
INFO(NCCL_NVLS, "rank %d reuse local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -611,7 +626,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
|
||||
if (localRegBufUsed == false) goto fail;
|
||||
}
|
||||
|
||||
INFO(NCCL_NVLS, "rank %d successfully local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
|
||||
INFO(NCCL_NVLS, "rank %d successfully local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
|
||||
|
||||
exit:
|
||||
*outRegBufSend = (void*)regSendPtr;
|
||||
|
||||
@@ -99,12 +99,15 @@ NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
|
||||
static int useMemcpy = 0;
|
||||
static void initCeOperation();
|
||||
|
||||
|
||||
extern int64_t ncclParamMNNVLEnable();
|
||||
|
||||
/* Determine if two peers can communicate through p2p */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
initCeOperation();
|
||||
|
||||
// MNNVL support
|
||||
if (info1->hostHash != info2->hostHash) {
|
||||
if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) {
|
||||
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
|
||||
if (*ret) return ncclSuccess;
|
||||
}
|
||||
@@ -467,6 +470,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
buff += comm->buffSizes[p];
|
||||
}
|
||||
}
|
||||
send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
|
||||
if (useMemcpy) {
|
||||
send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
|
||||
@@ -512,6 +516,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
|
||||
}
|
||||
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
|
||||
char* buff = (char*)(resources->recvDevMem+1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
@@ -749,8 +754,8 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
struct ncclTransport p2pTransport = {
|
||||
"P2P",
|
||||
p2pCanConnect,
|
||||
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL },
|
||||
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL }
|
||||
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL },
|
||||
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL }
|
||||
};
|
||||
|
||||
static void initCeOperation() {
|
||||
|
||||
@@ -150,6 +150,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
}
|
||||
send->conn.tail = &resources->devRemHostMem->tail;
|
||||
send->conn.head = &resources->devHostMem->head;
|
||||
send->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
|
||||
if (useMemcpyRecv) {
|
||||
send->conn.connFifo = resources->devRemHostMem->connFifo;
|
||||
@@ -189,6 +190,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
}
|
||||
recv->conn.head = &resources->devRemHostMem->head;
|
||||
recv->conn.tail = &resources->devHostMem->tail;
|
||||
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
|
||||
if (useMemcpyRecv) {
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
|
||||
@@ -210,6 +212,7 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||
NCCLCHECK(ncclShmClose(resources->hostHandle));
|
||||
NCCLCHECK(ncclShmClose(resources->remHandle));
|
||||
free(resources);
|
||||
send->transportResources = NULL;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -220,6 +223,7 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||
NCCLCHECK(ncclShmClose(resources->hostHandle));
|
||||
NCCLCHECK(ncclShmClose(resources->remHandle));
|
||||
free(resources);
|
||||
recv->transportResources = NULL;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -271,6 +275,7 @@ static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, str
|
||||
CUDACHECK(cudaEventDestroy(resources->events[i]));
|
||||
}
|
||||
free(connection->transportResources);
|
||||
connection->transportResources = NULL;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -286,6 +291,7 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
|
||||
CUDACHECK(cudaEventDestroy(resources->events[i]));
|
||||
}
|
||||
free(connection->transportResources);
|
||||
connection->transportResources = NULL;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -409,8 +415,8 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
struct ncclTransport shmTransport = {
|
||||
"SHM",
|
||||
shmCanConnect,
|
||||
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
|
||||
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
|
||||
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL },
|
||||
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
static void initCeOperation() {
|
||||
|
||||
Referencia en una nueva incidencia
Block a user