2018-09-24 16:06:59 -07:00
|
|
|
/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
|
|
|
|
|
|
#ifndef NCCL_SOCKET_H_
|
|
|
|
|
#define NCCL_SOCKET_H_
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
#include "nccl.h"
|
2018-09-24 16:06:59 -07:00
|
|
|
#include <sys/socket.h>
|
|
|
|
|
#include <arpa/inet.h>
|
|
|
|
|
#include <netinet/tcp.h>
|
|
|
|
|
#include <netdb.h>
|
2022-01-07 06:39:55 -08:00
|
|
|
#include <fcntl.h>
|
|
|
|
|
#include <poll.h>
|
2018-09-24 16:06:59 -07:00
|
|
|
|
2018-10-24 14:44:59 -07:00
|
|
|
// Sizing limits and constants used by the socket declarations in this header.

// NOTE(review): presumably the upper bound on the number of network interfaces
// handled at once (cf. the maxIfs argument of ncclFindInterfaces) — confirm.
#define MAX_IFS 16
// NOTE(review): presumably the size in bytes of one interface-name buffer
// (cf. the ifNameMaxSize arguments below) — confirm.
#define MAX_IF_NAME_SIZE 16
// Sum of NI_MAXHOST and NI_MAXSERV (both from <netdb.h>), i.e. room for a
// host string plus a service string.
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
// Default value of the `magic` argument of ncclSocketInit().
#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
|
2018-09-24 16:06:59 -07:00
|
|
|
|
|
|
|
|
/*
 * Common socket address storage structure for IPv4/IPv6.
 *
 * All three members overlay the same storage:
 *   sa   - generic view (struct sockaddr)
 *   sin  - IPv4 view    (struct sockaddr_in)
 *   sin6 - IPv6 view    (struct sockaddr_in6)
 */
union ncclSocketAddress {
  struct sockaddr sa;
  struct sockaddr_in sin;
  struct sockaddr_in6 sin6;
};
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Connection states of a socket (type of the `state` field of struct ncclSocket).
// Values are explicit and contiguous starting at 0; ncclSocketStateNum equals
// the number of states that precede it and is not itself a state.
enum ncclSocketState {
  ncclSocketStateNone = 0,
  ncclSocketStateInitialized = 1,
  ncclSocketStateAccepting = 2,
  ncclSocketStateAccepted = 3,
  ncclSocketStateConnecting = 4,
  ncclSocketStateConnectPolling = 5,
  ncclSocketStateConnected = 6,
  ncclSocketStateReady = 7,
  ncclSocketStateTerminating = 8,
  ncclSocketStateClosed = 9,
  ncclSocketStateError = 10,
  ncclSocketStateNum = 11
};
|
|
|
|
|
|
|
|
|
|
// Kind of socket (type of the `type` field of struct ncclSocket).
// ncclSocketInit() uses ncclSocketTypeUnknown when no type is given.
enum ncclSocketType {
  ncclSocketTypeUnknown = 0,
  ncclSocketTypeBootstrap = 1,
  ncclSocketTypeProxy = 2,
  ncclSocketTypeNetSocket = 3,
  ncclSocketTypeNetIb = 4,
  ncclSocketTypeRasNetwork = 5
};
|
2022-01-07 06:39:55 -08:00
|
|
|
|
|
|
|
|
struct ncclSocket {
|
|
|
|
|
int fd;
|
2022-11-29 04:27:46 -08:00
|
|
|
int acceptFd;
|
2024-12-18 08:26:06 -08:00
|
|
|
int errorRetries;
|
2022-01-07 06:39:55 -08:00
|
|
|
union ncclSocketAddress addr;
|
|
|
|
|
volatile uint32_t* abortFlag;
|
|
|
|
|
int asyncFlag;
|
|
|
|
|
enum ncclSocketState state;
|
2022-11-29 04:27:46 -08:00
|
|
|
int salen;
|
|
|
|
|
uint64_t magic;
|
|
|
|
|
enum ncclSocketType type;
|
2024-12-18 08:26:06 -08:00
|
|
|
int customRetry;
|
|
|
|
|
int finalizeCounter; // Used to keep track of initial handshake for async sockets.
|
|
|
|
|
char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets.
|
2022-01-07 06:39:55 -08:00
|
|
|
};
|
2018-09-24 16:06:59 -07:00
|
|
|
|
2024-12-18 08:26:06 -08:00
|
|
|
const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
2022-11-29 04:27:46 -08:00
|
|
|
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
2025-05-29 20:56:40 -07:00
|
|
|
ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
|
|
|
|
|
union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found);
|
|
|
|
|
ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
|
|
|
|
|
int* nIfs);
|
2022-11-29 04:27:46 -08:00
|
|
|
|
|
|
|
|
// Initialize a socket
|
2024-12-18 08:26:06 -08:00
|
|
|
ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0);
|
2022-01-07 06:39:55 -08:00
|
|
|
// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
|
|
|
|
|
ncclResult_t ncclSocketListen(struct ncclSocket* sock);
|
2022-11-29 04:27:46 -08:00
|
|
|
ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
|
2022-01-07 06:39:55 -08:00
|
|
|
// Connect to sock->addr. sock->fd is set after a successful call.
|
|
|
|
|
ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
|
|
|
|
|
// Return socket connection state.
|
2022-11-29 04:27:46 -08:00
|
|
|
ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running);
|
|
|
|
|
// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
|
|
|
|
|
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock);
|
|
|
|
|
ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd);
|
|
|
|
|
ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
|
2018-09-24 16:06:59 -07:00
|
|
|
|
2018-11-19 17:43:50 -08:00
|
|
|
// Operation selectors.
// NOTE(review): presumably the values accepted by the `op` argument of
// ncclSocketProgress() and ncclSocketWait() — confirm.
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1
|
2018-09-24 16:06:59 -07:00
|
|
|
|
2024-12-18 08:26:06 -08:00
|
|
|
// NOTE(review): presumably makes non-blocking progress on operation `op`
// (NCCL_SOCKET_SEND or NCCL_SOCKET_RECV) over the size-byte buffer ptr,
// advancing *offset, and reports a peer close through *closed when that
// pointer is non-NULL — confirm against the implementation.
ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed = NULL);

// NOTE(review): presumably repeats the operation until *offset reaches size — confirm.
ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);

// Send size bytes from ptr on sock.
ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);

// Receive size bytes into ptr from sock.
ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);

// Send sendSize bytes from sendPtr on sendSock and receive recvSize bytes into recvPtr from recvSock.
// NOTE(review): whether the two transfers are interleaved is not visible here — confirm.
ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);

// Receive variant that reports a peer close through *closed.
// NOTE(review): the precise effect of `blocking` is not visible here — confirm.
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);

// NOTE(review): `how` presumably takes the same values as shutdown(2) (SHUT_RD/SHUT_WR/SHUT_RDWR) — confirm.
ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);

// Close sock. NOTE(review): the effect of wait = true is not visible here — confirm.
ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false);
|
2018-09-24 16:06:59 -07:00
|
|
|
#endif
|