Add support for IB SHARP 1PPN operation with user buffers.
Improve support for MNNVL, add NVLS support and multi-clique support.
 * Detect the NVLS clique through NVML
 * Exchange XML between peers in the same NVLS clique and fuse XMLs
   before creating the topology graph.
 * Rework bootstrap allgather algorithms to allow for large allgather
   operations intra-node (XML exchange).
Net/IB: add support for dynamic GID detection.
 * Automatically select the RoCEv2/IPv4 interface by default. Allow
   selecting IPv6 or even the network/mask.
Reduce NVLS memory usage.
 * Add stepSize as property of a connection to allow for different
   sizes on different peers; set it to 128K for NVLink SHARP.
Improve tuner loading
 * Look for more paths, be more consistent with the network device
   plugin.
 * Also search for tuner support inside the net plugin.
Improve tuner API
 * Add context to support multi-device per process.
Add magic number around comm object to detect comm corruption.
 * Add some basic checks around communicators so that we can report a
   problem when a communicator gets corrupted or a wrong comm pointer
   is passed to NCCL.
Fix net/IB error path. Github PR #1164
Fix collnet rail mapping with split comm.
Fix packet reordering issue causing bootstrap mismatch
 * Use a different tag in ncclTransportP2pSetup for the connectInfo
   exchange and the following barrier.
Fix hang when crossNic is inconsistent between ranks.
Fix minCompCap/maxCompCap computation. Github issue #1184
This commit is contained in:
Sylvain Jeaugey
2024-03-26 06:08:55 -07:00
parent 6dd51f15bf
commit ab2b89c4c3
52 changed files with 2124 additions and 944 deletions
+14 -9
View File
@@ -39,13 +39,17 @@ typedef struct {
const char* name;
// Initializes tuner states.
// nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
// nNodes: number of nodes in current communicator.
// logFunction: a logFunction can be useful to integrate logging together with NCCL core.
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
// Inputs:
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
// - nNodes: number of nodes in current communicator.
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
// Outputs:
// - context: tuner context object
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
// - context: tuner context object
// - collType: collective type, e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether collnet supports this type
@@ -62,16 +66,17 @@ typedef struct {
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
// Terminates the plugin and cleans up any resources that the plugin allocated.
ncclResult_t (*destroy)();
} ncclTuner_v1_t;
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
typedef ncclTuner_v1_t ncclTuner_t;
typedef ncclTuner_v2_t ncclTuner_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#endif
+4 -4
View File
@@ -8,17 +8,17 @@
#define __hidden __attribute__ ((visibility("hidden")))
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes,
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
__hidden ncclResult_t pluginDestroy() { return ncclSuccess; }
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
#define PLUGIN_NAME "Example"
const ncclTuner_v1_t ncclTunerPlugin_v1 = {
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,