2.21.5-1
Add support for IB SHARP 1PPN operation with user buffers. Improve support for MNNVL, add NVLS support and multi-clique support. * Detect the NVLS clique through NVML * Exchange XML between peers in the same NVLS clique and fuse XMLs before creating the topology graph. * Rework bootstrap allgather algorithms to allow for large allgather operations intra-node (XML exchange). Net/IB: add support for dynamic GID detection. * Automatically select RoCEv2/IPv4 interface by default. Allow to select IPv6 or even the network/mask. Reduce NVLS memory usage. * Add stepSize as property of a connection to allow for different sizes on different peers; set it to 128K for NVLink SHARP. Improve tuner loading * Look for more paths, be more consistent with the network device plugin. * Also search for tuner support inside the net plugin. Improve tuner API * Add context to support multi-device per process. Add magic number around comm object to detect comm corruption. * Add some basic check around communicators so that we can report a problem when a communicator gets corrupted or a wrong comm pointer is passed to NCCL. Fix net/IB error path. Github PR #1164 Fix collnet rail mapping with split comm. Fix packet reordering issue causing bootstrap mismatch * Use a different tag in ncclTransportP2pSetup for the connectInfo exchange and the following barrier. Fix hang when crossNic is inconsistent between ranks. Fix minCompCap/maxCompCap computation. Github issue #1184
This commit is contained in:
@@ -39,13 +39,17 @@ typedef struct {
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
|
||||
// nNodes: number of nodes in current communicator.
|
||||
// logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type , e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - collNetSupport: whether collnet supports this type
|
||||
@@ -62,16 +66,17 @@ typedef struct {
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
ncclResult_t (*destroy)();
|
||||
} ncclTuner_v1_t;
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
|
||||
typedef ncclTuner_v1_t ncclTuner_t;
|
||||
typedef ncclTuner_v2_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -8,17 +8,17 @@
|
||||
|
||||
#define __hidden __attribute__ ((visibility("hidden")))
|
||||
|
||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
|
||||
|
||||
__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes,
|
||||
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
|
||||
|
||||
__hidden ncclResult_t pluginDestroy() { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
|
||||
|
||||
#define PLUGIN_NAME "Example"
|
||||
|
||||
const ncclTuner_v1_t ncclTunerPlugin_v1 = {
|
||||
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.getCollInfo = pluginGetCollInfo,
|
||||
|
||||
Reference in New Issue
Block a user