2.22.3-1
Rework core for NVIDIA Trusted Computing * Compress work structs so that they are shared between channels * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo. * Rework the task preprocessing phase. * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations. * Rename src/include/align.h to src/include/bitops.h Add lazy connection establishment for collective operations * Move buffer allocation and connection establishment to the first collective operation using that algorithm. * Accelerate init time and reduce memory usage. * Avoid allocating NVLS buffers if all calls are registered. * Compute algo/proto in ncclLaunchCollTasksInfo early on. * Connect peers in ncclCollPreconnectFunc if not connected already. * Also move shared buffer creation to the first send/recv call. Accelerate intra-node NVLink detection * Make each rank only detect NVLinks attached to its GPU. * Fuse XMLs to reconstruct the full NVLink topology Add init profiling to report time spend in different init phases. * Report timings of bootstrap, allgather, search, connect, etc. * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS. Add support for PCI p2p on split PCI switches * Detect split PCI switches through a kernel module exposing switch information. * Update the topology XML and graph to add those inter-switch connections. Add cost estimation API * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take. Net/IB: Add separate traffic class for fifo messages * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC. Merges PR #1194 Net/IB: Add support for IB router * Use flid instead of lid if subnets do not match * Warn if flid is 0 Optimizations and fixes for device network offload (unpack) * Double the default number of channels * Cache netDeviceType * Fix save/increment head logic to enable Tree support. Support ncclGroupStart/End for ncclCommAbort/Destroy * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process. Improve Tuner API * Provide to the plugin the original cost table so that the plugin can leave unknown or disabled algo/proto combinations untouched. * Remove nvlsSupport and collnetSupport. Do not print version to stdout when using a debug file * Also print version from all processes with INFO debug level. Fixes issue #1271 Fix clang warnings in NVTX headers * Update NVTX headers to the latest version Fixes issue #1270 Disable port fusion in heterogeneous systems * Do not fuse ports if a mix of multi-port and single port are detected. Fix NVLS graphs search for dual NICs. * Fix NVLS graph search when we have more than one NIC per GPU. Fix crash with collnetDirect * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search. Fix hang when nodes have different CPU types * Add the CPU type to the rank peer info. * Align all ranks on the CPU type after the first allgather. * Only use the aligned CPU type for all tuning operations. Fixes issue #1136 Fixes issue #1184 Fix performance of registered send/recv operations * Allow for single full size operations * Add INFO to confirm the registration of send/recv buffers. Move all sync ops to finalize stage * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called. Improve error reporting during SHM segment creation Improve support of various compilers Merges PR #1177 Merges PR #1228 Allow net and tuner plugins to be statically linked * Search for ncclNet or ncclTuner symbols in the main binary. Merges PR #979 Plugin examples includes cleanup * Harmonize err.h and common.h usage. * Add mixed plugin with both net and tuner.
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef COMMON_H_
|
||||
#define COMMON_H_
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_ERR_H_
|
||||
#define NCCL_ERR_H_
|
||||
|
||||
/* Error type for plugins */
|
||||
typedef enum { ncclSuccess = 0,
|
||||
ncclUnhandledCudaError = 1,
|
||||
ncclSystemError = 2,
|
||||
ncclInternalError = 3,
|
||||
ncclInvalidArgument = 4,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclRemoteError = 6 } ncclResult_t;
|
||||
|
||||
#endif
|
||||
@@ -8,15 +8,24 @@
|
||||
#ifndef NCCL_TUNER_H_
|
||||
#define NCCL_TUNER_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
|
||||
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
|
||||
typedef enum {
|
||||
ncclFuncBroadcast = 0,
|
||||
ncclFuncReduce = 1,
|
||||
ncclFuncAllGather = 2,
|
||||
ncclFuncReduceScatter = 3,
|
||||
ncclFuncAllReduce = 4,
|
||||
ncclFuncSendRecv = 5,
|
||||
ncclFuncSend = 6,
|
||||
ncclFuncRecv = 7,
|
||||
ncclNumFuncs = 8
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
@@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
|
||||
#define NCCL_ALGO_PROTO_IGNORE -1.0
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
@@ -52,31 +63,33 @@ typedef struct {
|
||||
// - context: tuner context object
|
||||
// - collType: collective type , e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - collNetSupport: whether collnet supports this type
|
||||
// - nvlsSupport: whether nvlink sharp supports this time
|
||||
// - numPipeOps: number of operations in the group
|
||||
// - numAlgo: number of algorithms in collCostTable
|
||||
// - numProto: number of protocols in collCostTable
|
||||
//
|
||||
// Outputs:
|
||||
// - algorithm: selected algorithm to be used for the given collective
|
||||
// - protocol: selected protocol to be used for the given collective
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// InOut:
|
||||
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels);
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
} ncclTuner_v3_t;
|
||||
|
||||
typedef ncclTuner_v2_t ncclTuner_t;
|
||||
typedef ncclTuner_v3_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -11,14 +11,21 @@
|
||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
|
||||
|
||||
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int* nChannels) {
|
||||
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
|
||||
if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
|
||||
collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
|
||||
}
|
||||
*nChannels = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
|
||||
|
||||
#define PLUGIN_NAME "Example"
|
||||
|
||||
const ncclTuner_v2_t ncclTunerPlugin_v2 = {
|
||||
const ncclTuner_v3_t ncclTunerPlugin_v3 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.getCollInfo = pluginGetCollInfo,
|
||||
|
||||
Referens i nytt ärende
Block a user