Files
rocm-systems/src/register/register.cc
T
Sylvain Jeaugey 6aae379278 2.24.3-1
Network user buffer support for collectives
 * Leverage user buffer registration to achieve zero-copy
   inter-node communications for Ring, NVLS and Collnet

Add RAS subsystem
 * Create a RAS thread keeping track of all NCCL communicators.
 * Add a ncclras tool contacting the RAS thread and getting a
   report.

Add fp8 support
 * Add support for e5m2 and e4m3 8-bit floating point operations.
 * Use Tree/PAT algorithms when possible for better numerical
   stability.

Add NIC fusion
 * Add a NET API to ask the network plugin to fuse a set of
   interfaces together.
 * Fuse multiple NICs under the same PCI switch as a single,
   larger NIC.

Socket connection failure retry
 * Retry in case of socket connection failure (unreachable host)
 * Avoid "Software caused connection abort" errors on retries

QP connection failure retry
 * Retry in case of IB QP connection failure during ibv_modify_qp.

NET API improvements
 * Allow plugins to force a flush in case data and completion
   ordering is not guaranteed.
 * Indicate when completion is not needed (e.g. for the LL128
   protocol), allowing plugins to skip generating a completion.
 * Allow for full offload of allgather operations when using one
   GPU per node.

NCCL_ALGO/NCCL_PROTO strict enforcement
 * Extend NCCL_ALGO/NCCL_PROTO syntax to be able to specify
   ALGO/PROTO filters for each collective operation.
 * Strictly enforce the ALGO/PROTO filters, no longer fall back
   on the ring algorithm when the filtering leaves no option and
   error out instead.

Enable CUMEM host allocations
 * Use cumem functions for host memory allocation by default.

Improved profiler plugin API
 * Avoid dependencies with NCCL includes.
 * Add information on whether the buffer is registered or not

Adjust PAT tuning
 * Improve transition between PAT and ring at scale.

Fix hangs when running with different CPU architectures
 * Detect when we use a mix of CPU architectures
 * Ensure Algo/Proto decisions are made based on that unified
   state.

Fix FD leak in UDS
 * Fix a leak when mapping buffers intra-node with cumem IPCs.

Fix crash when mixing buffer registration and graph buffer registration.
 * Separate local and graph registration to avoid crashes when we free
   buffers.

Fix user buffer registration with dmabuf
 * Make ncclSend/ncclRecv communication with buffer registration functional
   on network plugins relying on dmabuf for buffer registration.

Fix crash in IB code caused by uninitialized fields.

Fix non-blocking ncclSend/ncclRecv
 * Fix case where ncclSend/ncclRecv would return ncclSuccess in non-blocking
   mode even though the operation was not enqueued onto the stream.
 * Issue #1495

Various compiler tweaks and fixes
 * PR #758

Fix typo in ncclTopoPrintGraph
 * Issue #1468
2025-01-07 02:01:15 -08:00

180 строки
6.8 KiB
C++

/*************************************************************************
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "argcheck.h" // Need some checks here since we access comm
#include "nccl.h"
#include "comm.h"
#include "net.h"
#include "register.h"
#include "transport.h"
// Looks up a cached registration that fully contains the buffer [data, data+size).
//
// The request is widened to page granularity using cache->pageSize before it is
// compared against the cached entries. The scan relies on the slots being ordered
// by ascending base address: it stops at the first slot whose base address lies
// above the (page-aligned) request.
//
// comm : communicator whose regCache is searched.
// data : start of the user buffer.
// size : size of the user buffer in bytes.
// reg  : out; set to the containing registration, or NULL when none is found.
//
// Always returns ncclSuccess.
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
  struct ncclRegCache* regCache = &comm->regCache;
  const uintptr_t pageBytes = regCache->pageSize;
  // Round the start down to a page boundary and count the pages spanned.
  // NOTE(review): the mask assumes pageSize is a power of two — confirm.
  const uintptr_t base = (uintptr_t)data & -pageBytes;
  const size_t nPages = ((uintptr_t)data + size - base + pageBytes-1)/pageBytes;

  *reg = NULL;
  for (int i = 0; i < regCache->population; i++) {
    struct ncclReg* entry = regCache->slots[i];
    // Entries past this point start above the request; no match is possible.
    if (base < entry->addr) break;
    // Here base >= entry->addr: match if the request ends within the entry.
    if ((base - entry->addr)/pageBytes + nPages <= entry->pages) {
      *reg = entry;
      break;
    }
  }
  return ncclSuccess;
}
// Declares the "LocalRegister" parameter through the NCCL_PARAM macro, with key
// "LOCAL_REGISTER" and a third argument of 1. ncclCommRegister() in this file
// calls ncclParamLocalRegister() and skips registration when it returns 0.
// NOTE(review): presumably the macro generates ncclParamLocalRegister(), reads the
// NCCL_LOCAL_REGISTER environment variable and uses 1 as the default — confirm
// against the NCCL_PARAM definition.
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
// Reports whether `reg` currently holds at least one local reference
// (reg->localRefs is non-zero).
//
// reg     : registration to inspect; may be NULL.
// isValid : out; written only when both `reg` and `isValid` are non-NULL.
//
// Always returns ncclSuccess.
ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) {
  if (reg == NULL || isValid == NULL) return ncclSuccess;
  *isValid = reg->localRefs ? true : false;
  return ncclSuccess;
}
// Registers the buffer [data, data+size) in the communicator's registration cache.
//
// The buffer is widened to page granularity (cache->pageSize). If an existing
// entry fully contains the widened range, its reference count is bumped and it is
// returned; otherwise a new entry is inserted so that the slots stay ordered by
// ascending base address.
//
// comm    : communicator owning the cache; validated with CommCheck.
// data    : start of the user buffer (checked with CudaPtrCheck when
//           comm->checkPointers is set).
// size    : size of the user buffer in bytes.
// isGraph : true to count the reference in graphRefs, false for localRefs.
// handle  : out; receives the ncclReg* describing the registration.
//
// Returns ncclSuccess, or the error reported by CommCheck / CudaPtrCheck /
// ncclRealloc / ncclCalloc. On an allocation failure the cache is left unchanged
// apart from possibly having a larger capacity.
ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool isGraph, void** handle) {
  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
  struct ncclRegCache* cache = &comm->regCache;
  uintptr_t pageSize = cache->pageSize;
  // NOTE(review): the mask assumes pageSize is a power of two — confirm.
  uintptr_t addr = (uintptr_t)data & -pageSize;
  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;

  if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister"));
  INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);

  for (int slot=0; /*true*/; slot++) {
    if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
      // Insertion point found. Perform every fallible step BEFORE touching the
      // slot array contents so that a failure cannot leave the cache corrupted.
      if (cache->population == cache->capacity) { // must grow cache
        auto newCapacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
        NCCLCHECK(ncclRealloc(&cache->slots, cache->population, newCapacity));
        // Only record the new capacity once the reallocation has succeeded.
        cache->capacity = newCapacity;
      }
      struct ncclReg* regSlot = NULL;
      NCCLCHECK(ncclCalloc(&regSlot, 1));

      // Nothing below can fail: open a hole at `slot` and publish the new entry.
      memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
      cache->slots[slot] = regSlot;
      regSlot->addr = addr;
      regSlot->pages = pages;
      if (isGraph) regSlot->graphRefs = 1;
      else regSlot->localRefs = 1;
      cache->population += 1;
      *handle = regSlot;
      return ncclSuccess;
    }
    if ((addr >= cache->slots[slot]->addr) &&
        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
      // The request is fully covered by an existing entry: just add a reference.
      if (isGraph) cache->slots[slot]->graphRefs++;
      else cache->slots[slot]->localRefs++;
      *handle = cache->slots[slot];
      return ncclSuccess;
    }
  }
}
// Releases every transport-level registration attached to `reg`, as indicated by
// the bits set in reg->state (NET, NVLS, COLLNET, IPC).
//
// Deregistration failures are reported with WARN and cleanup continues; the only
// error propagated to the caller is a failure of ncclCudaFree on the device-side
// IPC peer address array. `reg` itself is not freed here.
static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) {
  // Network: walk the handle list, deregistering and freeing each node.
  if (reg->state & NET_REG_COMPLETE) {
    struct ncclRegNetHandles* cur = reg->netHandleHead;
    while (cur != NULL) {
      if (ncclNetDeregBuffer(comm, cur->proxyConn, cur->handle) != ncclSuccess) {
        WARN("rank %d deregister NET buffer handle %p proxy rank %d failed\n", comm->rank, cur->handle, cur->proxyConn->rank);
      }
      struct ncclRegNetHandles* done = cur;
      cur = cur->next;
      free(done);
    }
  }

  // NVLS: drop the registration and clear the recorded address.
  if (reg->state & NVLS_REG_COMPLETE) {
    ncclResult_t nvlsRes = ncclNvlsDeregBuffer(comm, &reg->mcHandle, reg->regAddr, reg->dev, reg->regSize);
    if (nvlsRes != ncclSuccess) {
      WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize);
    }
    reg->regAddr = (CUdeviceptr)NULL;
  }

  // CollNet: single handle.
  if (reg->state & COLLNET_REG_COMPLETE) {
    ncclResult_t collnetRes = ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle);
    if (collnetRes != ncclSuccess) {
      WARN("rank %d deregister COLLNET buffer handle %p proxy rank %d failed", comm->rank, reg->collnetHandle, reg->collnetProxyconn->rank);
    }
  }

  // IPC: one optional info entry per local rank, plus the peer address arrays.
  if (reg->state & IPC_REG_COMPLETE) {
    for (int peer = 0; peer < NCCL_MAX_LOCAL_RANKS; ++peer) {
      if (!reg->ipcInfos[peer]) continue;
      if (ncclIpcDeregBuffer(comm, reg->ipcInfos[peer]) != ncclSuccess) {
        WARN("rank %d deregister IPC buffer %p peerRank %d failed", comm->rank, reg->ipcInfos[peer]->baseAddr, reg->ipcInfos[peer]->peerRank);
      }
      free(reg->ipcInfos[peer]);
    }
    if (reg->regIpcAddrs.hostPeerRmtAddrs) {
      free(reg->regIpcAddrs.hostPeerRmtAddrs);
    }
    if (reg->regIpcAddrs.devPeerRmtAddrs) {
      NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
    }
  }
  return ncclSuccess;
}
// Tears down the communicator's whole registration cache: every populated slot is
// passed to regCleanup() and freed, then the slot array itself is freed.
//
// Returns ncclSuccess, or the first error returned by regCleanup(); in that case
// the remaining entries and the slot array are left untouched.
ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
  struct ncclRegCache* regCache = &comm->regCache;
  int idx = 0;
  while (idx < regCache->population) {
    struct ncclReg* entry = regCache->slots[idx];
    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)entry->addr, entry->pages);
    NCCLCHECK(regCleanup(comm, entry));
    free(entry);
    idx++;
  }
  free(regCache->slots);
  return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
// Public entry point for local (non-graph) buffer registration.
//
// When ncclParamLocalRegister() returns non-zero the buffer is registered through
// ncclRegister() with isGraph=false and *handle receives the registration.
// Otherwise nothing is registered and *handle is set to NULL.
//
// Returns ncclSuccess, or the error reported by ncclRegister().
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
  if (ncclParamLocalRegister()) {
    NCCLCHECK(ncclRegister(comm, buff, size, false, handle));
  } else {
    *handle = NULL;
  }
  return ncclSuccess;
}
// Registers `buff` on behalf of a graph: forwards to ncclRegister() with
// isGraph=true so the reference is counted in graphRefs.
//
// Returns ncclSuccess, or the error reported by ncclRegister().
ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
  const bool forGraph = true;
  NCCLCHECK(ncclRegister(comm, buff, size, forGraph, handle));
  return ncclSuccess;
}
// Drops one reference (graph or local) from the registration `reg` and, once no
// references of either kind remain, releases its transport registrations via
// regCleanup(), frees it and removes it from the communicator's cache.
//
// comm    : communicator owning the cache; validated with CommCheck.
// isGraph : true to decrement graphRefs, false to decrement localRefs.
// reg     : handle previously returned by registration; NULL is a no-op.
//
// The current CUDA device is switched to comm->cudaDev for the duration of the
// call and is restored on every exit path taken after the switch, including the
// error paths.
//
// Returns ncclSuccess; ncclInvalidUsage when `reg` is not present in the cache;
// or the error reported by CommCheck, the CUDA device calls, or regCleanup().
static ncclResult_t commDeregister(struct ncclComm *comm, bool isGraph, struct ncclReg* reg) {
  NCCLCHECK(CommCheck(comm, "ncclCommDeregister", "comm"));
  if (reg == NULL) return ncclSuccess;

  struct ncclRegCache* cache = &comm->regCache;
  ncclResult_t ret = ncclSuccess;
  int slot;
  int saveDev;
  CUDACHECK(cudaGetDevice(&saveDev));
  CUDACHECK(cudaSetDevice(comm->cudaDev));

  // Locate the handle in the cache.
  for (slot = 0; slot < cache->population && cache->slots[slot] != reg; slot++);
  if (slot == cache->population) {
    WARN("Deregister: Could not find handle");
    ret = ncclInvalidUsage;
  } else {
    if (isGraph) --reg->graphRefs;
    else --reg->localRefs;
    // Only tear the entry down when nobody references it any more.
    if (!(reg->localRefs || reg->graphRefs)) {
      ret = regCleanup(comm, reg);
      if (ret == ncclSuccess) {
        free(reg);
        // Close the hole left by the removed slot.
        memmove(cache->slots + slot, cache->slots + slot + 1, (cache->population - slot - 1) * sizeof(struct ncclReg*));
        cache->population -= 1;
      }
    }
  }

  // Always give the caller back the device that was current on entry.
  CUDACHECK(cudaSetDevice(saveDev));
  return ret;
}
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
// Public entry point releasing a local (non-graph) reference on `handle`.
// The opaque handle is interpreted as a struct ncclReg* and forwarded to
// commDeregister() with isGraph=false.
//
// Returns ncclSuccess, or the error reported by commDeregister().
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void *handle) {
  struct ncclReg* regHandle = (struct ncclReg*)handle;
  NCCLCHECK(commDeregister(comm, false, regHandle));
  return ncclSuccess;
}
// Releases a graph reference on `handle`: forwards to commDeregister() with
// isGraph=true.
//
// Returns ncclSuccess, or the error reported by commDeregister().
ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle) {
  const bool forGraph = true;
  NCCLCHECK(commDeregister(comm, forGraph, handle));
  return ncclSuccess;
}