6aae379278
Network user buffer support for collectives * Leverage user buffer registration to achieve zero-copy inter-node communications for Ring, NVLS and Collnet Add RAS subsystem * Create a RAS thread keeping track of all NCCL communicators. * Add a ncclras tool contacting the RAS thread and getting a report. Add fp8 support * Add support for e5m2 and e4m3 8-bit floating point operations. * Use Tree/PAT algorithms when possible for better numerical stability. Add NIC fusion * Add a NET API to ask the network plugin to fuse a set of interfaces together. * Fuse multiple NICs under the same PCI switch as a single, larger NIC. Socket connection failure retry * Retry in case of socket connection failure (unreachable host) * Avoid "Software caused connection abort" errors on retries QP connection failure retry * Retry in case of IB QP connection failure during ibv_modify_qp. NET API improvements * Allow plugins to force a flush in case data and completion ordering is not guaranteed. * Indicate when completion is not needed (e.g. for the LL128 protocol), allowing plugins to skip generating a completion. * Allow for full offload of allgather operations when using one GPU per node. NCCL_ALGO/NCCL_PROTO strict enforcement * Extend NCCL_ALGO/NCCL_PROTO syntax to be able to specify ALGO/PROTO filters for each collective operation. * Strictly enforce the ALGO/PROTO filters, no longer fall back on the ring algorithm when the filtering leaves no option and error out instead. Enable CUMEM host allocations * Use cumem functions for host memory allocation by default. Improved profiler plugin API * Avoid dependencies with NCCL includes. * Add information on whether the buffer is registered or not Adjust PAT tuning * Improve transition between PAT and ring at scale. Fix hangs when running with different CPU architectures * Detect when we use a mix of GPU architectures * Ensure Algo/Proto decisions are made based on that unified state. Fix FD leak in UDS * Fix a leak when mapping buffers intra-node with cumem IPCs. Fix crash when mixing buffer registration and graph buffer registration. * Separate local and graph registration to avoid crashes when we free buffers. Fix user buffer registration with dmabuf * Make ncclSend/ncclRecv communication with buffer registration functional on network plugins relying on dmabuf for buffer registration. Fix crash in IB code caused by uninitialized fields. Fix non-blocking ncclSend/ncclRecv * Fix case where ncclSend/ncclRecv would return ncclSuccess in non-blocking mode even though the operation was not enqueued onto the stream. * Issue #1495 Various compiler tweaks and fixes * PR #758 Fix typo in ncclTopoPrintGraph * Issue #1468
180 строки
6.8 KiB
C++
180 строки
6.8 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "argcheck.h" // Need some checks here since we access comm
|
|
#include "nccl.h"
|
|
#include "comm.h"
|
|
#include "net.h"
|
|
#include "register.h"
|
|
#include "transport.h"
|
|
|
|
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
|
|
struct ncclRegCache* cache = &comm->regCache;
|
|
uintptr_t pageSize = cache->pageSize;
|
|
uintptr_t addr = (uintptr_t)data & -pageSize;
|
|
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
|
|
|
*reg = NULL;
|
|
for (int slot=0; /*true*/; slot++) {
|
|
if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess;
|
|
if ((addr >= cache->slots[slot]->addr) &&
|
|
((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
|
|
*reg = cache->slots[slot];
|
|
return ncclSuccess;
|
|
}
|
|
}
|
|
}
|
|
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
|
|
|
ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) {
|
|
if (reg && isValid) {
|
|
if (reg->localRefs)
|
|
*isValid = true;
|
|
else
|
|
*isValid = false;
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool isGraph, void** handle) {
|
|
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
|
|
struct ncclRegCache* cache = &comm->regCache;
|
|
uintptr_t pageSize = cache->pageSize;
|
|
uintptr_t addr = (uintptr_t)data & -pageSize;
|
|
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
|
|
|
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister"));
|
|
INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
|
|
|
|
for (int slot=0; /*true*/; slot++) {
|
|
if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
|
|
if (cache->population == cache->capacity) { // must grow cache
|
|
cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
|
|
NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity));
|
|
}
|
|
memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
|
|
NCCLCHECK(ncclCalloc(cache->slots+slot, 1));
|
|
struct ncclReg* regSlot = cache->slots[slot];
|
|
regSlot->addr = addr;
|
|
regSlot->pages = pages;
|
|
if (isGraph) regSlot->graphRefs = 1;
|
|
else regSlot->localRefs = 1;
|
|
cache->population += 1;
|
|
*handle = regSlot;
|
|
goto exit;
|
|
} else if ((addr >= cache->slots[slot]->addr) &&
|
|
((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
|
|
if (isGraph) cache->slots[slot]->graphRefs++;
|
|
else cache->slots[slot]->localRefs++;
|
|
*handle = cache->slots[slot];
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
exit:
|
|
return ncclSuccess;
|
|
}
|
|
|
|
static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) {
|
|
if (reg->state & NET_REG_COMPLETE) {
|
|
struct ncclRegNetHandles* netHandle = reg->netHandleHead;
|
|
struct ncclRegNetHandles* netHandlePrev;
|
|
while(netHandle) {
|
|
if (ncclNetDeregBuffer(comm, netHandle->proxyConn, netHandle->handle) != ncclSuccess) {
|
|
WARN("rank %d deregister NET buffer handle %p proxy rank %d failed\n", comm->rank, netHandle->handle, netHandle->proxyConn->rank);
|
|
}
|
|
netHandlePrev = netHandle;
|
|
netHandle = netHandle->next;
|
|
free(netHandlePrev);
|
|
}
|
|
}
|
|
if (reg->state & NVLS_REG_COMPLETE) {
|
|
if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) {
|
|
WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize);
|
|
}
|
|
reg->regAddr = (CUdeviceptr)NULL;
|
|
}
|
|
if (reg->state & COLLNET_REG_COMPLETE) {
|
|
if (ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle) != ncclSuccess) {
|
|
WARN("rank %d deregister COLLNET buffer handle %p proxy rank %d failed", comm->rank, reg->collnetHandle, reg->collnetProxyconn->rank);
|
|
}
|
|
}
|
|
if (reg->state & IPC_REG_COMPLETE) {
|
|
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i)
|
|
if (reg->ipcInfos[i]) {
|
|
if (ncclIpcDeregBuffer(comm, reg->ipcInfos[i]) != ncclSuccess) {
|
|
WARN("rank %d deregister IPC buffer %p peerRank %d failed", comm->rank, reg->ipcInfos[i]->baseAddr, reg->ipcInfos[i]->peerRank);
|
|
}
|
|
free(reg->ipcInfos[i]);
|
|
}
|
|
if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs);
|
|
if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
|
|
struct ncclRegCache* cache = &comm->regCache;
|
|
for (int i = 0; i < cache->population; i++) {
|
|
struct ncclReg* reg = cache->slots[i];
|
|
INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->addr, reg->pages);
|
|
NCCLCHECK(regCleanup(comm, reg));
|
|
free(reg);
|
|
}
|
|
free(cache->slots);
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
|
|
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
|
|
if (!ncclParamLocalRegister())
|
|
*handle = NULL;
|
|
else
|
|
NCCLCHECK(ncclRegister(comm, buff, size, false, handle));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
|
|
NCCLCHECK(ncclRegister(comm, buff, size, true, handle));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
static ncclResult_t commDeregister(struct ncclComm *comm, bool isGraph, struct ncclReg* reg) {
|
|
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
|
|
struct ncclRegCache* cache = &comm->regCache;
|
|
int slot;
|
|
int saveDev;
|
|
if (reg == NULL) goto exit;
|
|
CUDACHECK(cudaGetDevice(&saveDev));
|
|
CUDACHECK(cudaSetDevice(comm->cudaDev));
|
|
for (slot = 0; slot < cache->population && cache->slots[slot] != reg; slot++);
|
|
if (slot == cache->population) {
|
|
WARN("Deregister: Could not find handle");
|
|
return ncclInvalidUsage;
|
|
}
|
|
if (isGraph) --reg->graphRefs;
|
|
else --reg->localRefs;
|
|
if (reg->localRefs || reg->graphRefs) return ncclSuccess;
|
|
NCCLCHECK(regCleanup(comm, reg));
|
|
free(reg);
|
|
memmove(cache->slots + slot, cache->slots + slot + 1, (cache->population - slot - 1) * sizeof(struct ncclReg*));
|
|
cache->population -= 1;
|
|
CUDACHECK(cudaSetDevice(saveDev));
|
|
exit:
|
|
return ncclSuccess;
|
|
}
|
|
|
|
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
|
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void *handle) {
|
|
NCCLCHECK(commDeregister(comm, false, (struct ncclReg*)handle));
|
|
return ncclSuccess;
|
|
}
|
|
|
|
ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle) {
|
|
NCCLCHECK(commDeregister(comm, true, handle));
|
|
return ncclSuccess;
|
|
}
|