Files
rocm-systems/src/misc/ipcsocket.cc
T
Sylvain Jeaugey 68b542363f 2.23.4-1
Add scalable init API
 * Add new ncclCommInitRankScalable to allow for passing multiple
   unique IDs to the init function.
 * Spreads the load onto multiple bootstrap roots, allowing for
   constant bootstrap time.
 * Requires multiple ranks to create a unique ID, and the CPU-side
   ID exchange code to call allgather[v] instead of broadcast.

Accelerate init bootstrap operations
 * Reduce the number of calls to allgather.
 * Allow roots to reply early to ranks when information is already
   available.
 * Add an option to use ncclNet instead of sockets to perform
   bootstrap allgather operations.

Add PAT algorithms for Allgather and ReduceScatter
 * Parallel Aggregated Trees, variation of Bruck algorithm.
 * Logarithmic number of network steps for small sizes at scale.
 * Only supports one rank per node at the moment.

Add support for registered buffers for intra-node communication.
 * Allow registered user buffers to be accessed directly intra-node
 * Avoids extra copies in algorithms which permit it, saving
   memory bandwidth and helping with compute overlap.

Add profiler plugin API
 * New plugin API for profiling
 * Supports various levels of profiling, with a hierarchy.

Asynchronous graph allocation
 * Make calls to cudaMalloc and cudaMemcpy during graph allocation
   asynchronous.
 * Significantly speeds up graph capture.

Use fatal IB asynchronous events to stop network operation
 * Avoids many other error messages
 * Only fatal errors are affected; potentially transient errors
   (e.g. port down) do not cause an immediate stop.

Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
 * P2P would cause a significant performance degradation when using
   many GPUs, and therefore many interleaved data flows.
 * Disable P2P through the CPU when we have 3+ GPUs per node; keep it
   enabled when we only have 2 GPUs.

Improve the init logs to report the real NCCL function.
 * Make the log report ncclCommInitRank or ncclCommSplit, rather than
   the generic ncclCommInitRankFunc.

Add a parameter to set the location of the user configuration file.
 * Add NCCL_CONF_FILE environment variable to set where the user's
   configuration file resides.

Increase default IB timeout
 * Increase IB timeout value from 18 to 20.
 * Should help avoid fatal errors on large RoCE systems.

Add new check for nvidia peermem
 * On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer
   present; check for /sys/module/nvidia_peermem/version instead.

Fix old performance regression when mixing small and large operations.
 * Improves distribution of work on channels.

Fix crash when NUMA IDs are equal to -1.
 * Can happen when a NIC is a virtual NIC, or when linux doesn't
   know which NUMA node a device is attached to
 * Issue NVIDIA/nccl-tests#233

Fix tree graph search when NCCL_CROSS_NIC is set to 1.
 * Would force NCCL to use the balanced_tree pattern, thereby
   disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
 * Would also try to use alternate rings even though it was not
   needed.

Compiler tweaks and fixes
 * PR #1177
 * PR #1228

Fix stack smash
 * PR #1325

Fixes for multi-node NVLink + IB operation

Coverity fixes and comments.
2024-09-16 23:41:17 -07:00

230 lines
6.4 KiB
C++

/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#include "ipcsocket.h"
#include "utils.h"
#include <stdlib.h>
#include <string.h>
#include <errno.h>
// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET
#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx"
/*
* Create a Unix Domain Socket
*/
ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
int fd = -1;
struct sockaddr_un cliaddr;
char temp[NCCL_IPC_SOCKNAME_LEN] = "";
if (handle == NULL) {
return ncclInternalError;
}
handle->fd = -1;
handle->socketName[0] = '\0';
if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno);
return ncclSystemError;
}
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
// Create unique name for the socket.
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
close(fd);
return ncclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
unlink(temp);
#endif
TRACE(NCCL_INIT, "UDS: Creating socket %s", temp);
strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno);
close(fd);
return ncclSystemError;
}
handle->fd = fd;
strcpy(handle->socketName, temp);
handle->abortFlag = abortFlag;
// Mark socket as non-blocking
if (handle->abortFlag) {
int flags;
SYSCHECK(flags = fcntl(fd, F_GETFL), "fcntl");
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
return ncclSuccess;
}
ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) {
if (handle == NULL) {
WARN("ncclSocketGetFd: pass NULL socket");
return ncclInvalidArgument;
}
if (fd) *fd = handle->fd;
return ncclSuccess;
}
ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
if (handle == NULL) {
return ncclInternalError;
}
if (handle->fd <= 0) {
return ncclSuccess;
}
#ifndef USE_ABSTRACT_SOCKET
if (handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
#endif
close(handle->fd);
return ncclSuccess;
}
ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
// Union to guarantee alignment requirements for control array
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
char dummy_buffer[1];
int ret;
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
if (hdr == NULL) {
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
} else {
iov[0].iov_base = hdr;
iov[0].iov_len = hdrLen;
}
msg.msg_iov = iov;
msg.msg_iovlen = 1;
while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Receiving data over socket failed : %d", errno);
return ncclSystemError;
}
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
}
if (recvFd != NULL) {
if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
return ncclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return ncclSystemError;
}
TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
}
return ncclSuccess;
}
ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd);
}
ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
char temp[NCCL_IPC_SOCKNAME_LEN];
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr *cmptr;
char dummy_buffer[1];
struct sockaddr_un cliaddr;
// Construct client address to send this shareable handle to
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot connect to provided name for socket. Name too large");
return ncclInternalError;
}
(void) strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void *)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
if (hdr == NULL) {
iov[0].iov_base = (void *)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
} else {
iov[0].iov_base = hdr;
iov[0].iov_len = hdrLen;
}
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
ssize_t sendResult;
while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) {
if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
return ncclSystemError;
}
if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
}
return ncclSuccess;
}
ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash);
}