ab2b89c4c3
Add support for IB SHARP 1PPN operation with user buffers. Improve support for MNNVL, add NVLS support and multi-clique support. * Detect the NVLS clique through NVML * Exchange XML between peers in the same NVLS clique and fuse XMLs before creating the topology graph. * Rework bootstrap allgather algorithms to allow for large allgather operations intra-node (XML exchange). Net/IB: add support for dynamic GID detection. * Automatically select RoCEv2/IPv4 interface by default. Allow to select IPv6 or even the network/mask. Reduce NVLS memory usage. * Add stepSize as property of a connection to allow for different sizes on different peers; set it to 128K for NVLink SHARP. Improve tuner loading * Look for more paths, be more consistent with the network device plugin. * Also search for tuner support inside the net plugin. Improve tuner API * Add context to support multi-device per process. Add magic number around comm object to detect comm corruption. * Add some basic check around communicators so that we can report a problem when a communicator gets corrupted or a wrong comm pointer is passed to NCCL. Fix net/IB error path. Github PR #1164 Fix collnet rail mapping with split comm. Fix packet reordering issue causing bootstrap mismatch * Use a different tag in ncclTransportP2pSetup for the connectInfo exchange and the following barrier. Fix hang when crossNic is inconsistent between ranks. Fix minCompCap/maxCompCap computation. Github issue #1184
31 lines
1.1 KiB
C
31 lines
1.1 KiB
C
/*************************************************************************
 * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#ifndef NET_DEVICE_H_
|
|
#define NET_DEVICE_H_
|
|
|
|
// Version value 0x0. NOTE(review): by its name this marks an invalid/unset device-offload version; no use site is
// visible in this header - confirm against the callers.
#define NCCL_NET_DEVICE_INVALID_VERSION 0x0
// Value 4096. NOTE(review): presumably a network MTU expressed in bytes - the unit is not stated in this header;
// confirm against the users of this macro.
#define NCCL_NET_MTU_SIZE 4096
|
|
|
|
// Arbitrary version number - a given NCCL build is only compatible with a single version of the device networking
// plugin. NCCL checks the version number supplied through net->getProperties() against its internal version.
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
|
|
|
|
// Network offload type, as carried in the netDeviceType member of a device handle.
typedef enum {
  NCCL_NET_DEVICE_HOST = 0,   // NOTE(review): presumably "no device-side offload" - confirm
  NCCL_NET_DEVICE_UNPACK = 1  // NOTE(review): presumably the "unpack" offload versioned by NCCL_NET_DEVICE_UNPACK_VERSION - confirm
} ncclNetDeviceType;
|
|
|
|
typedef struct {
|
|
ncclNetDeviceType netDeviceType; // Network offload type
|
|
int netDeviceVersion; // Version number for network offload
|
|
void* handle;
|
|
size_t size;
|
|
int needsProxyProgress;
|
|
} ncclNetDeviceHandle_v7_t;
|
|
|
|
// The v8 handle and the unversioned handle are both plain aliases of the v7 struct, so all three names denote the
// same type and layout.
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
|
|
|
|
#endif
|