2023-09-26 05:47:28 -07:00
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
# include "tuner.h"
# define __hidden __attribute__ ((visibility("hidden")))
2024-07-29 15:43:36 -04:00
# define HOPPER_COMPCAP_IDX 2
// NVLink, PCI, Network
# define NCCL_HW_NVLINK 0
# define NCCL_HW_PCI 1
# define NCCL_HW_NET 2
// Integer base-2 logarithm: returns floor(log2(n)) for n >= 1.
// Non-positive inputs return 0. Without that guard a negative n would never
// reach zero under an arithmetic right shift (it settles at -1) and the loop
// would spin forever; the shift is therefore done on an unsigned copy.
static long log2i(long n) {
  if (n <= 0) return 0;
  long bits = 0;
  for (unsigned long v = (unsigned long)n; v > 1; v >>= 1) bits++;
  return bits;
}
// Latencies in us, Bandwidths in GB/s
// Base latency indexed as [algorithm][protocol]; each row is { LL, LL128, Simple }.
// pluginInit() uses this as the starting value of latencies[coll][a][p].
static const float baseLat[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
  { 12.0, 12.0, 17.0 },  // Tree
  { 12.0, 12.0, 17.0 },  // Ring
  { 12.0, 12.0, 17.0 },  // CollNet Direct
  { 12.0, 12.0, 17.0 },  // CollNet Chain
  { 0, 0, 0 },           // NVLS
  { 0, 0, 0 },           // NVLS Tree
};
// Constants of the analytical cost model used by this tuner.
struct tuningModel {
  // Per-step hardware latency, indexed [hw][algorithm][protocol] where hw is
  // one of NCCL_HW_NVLINK / NCCL_HW_PCI / NCCL_HW_NET.
  float hwLat[3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  // Bus-bandwidth scaling, indexed [i][algorithm][protocol]; pluginInit()
  // uses i == 0 when nNodes <= 2 and i == 1 otherwise.
  float bwRatio[2][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  // Bandwidth correction per message-size bucket, indexed [protocol][bucket]
  // with bucket = log2i(nBytes >> 6), clamped to the last entry (26).
  float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
  float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
};
// Model constants read by pluginInit() (hwLat, bwRatio) and by
// ncclTopoGetAlgoTime_Tuner() (correction factors).
// Every inner triple is ordered { LL, LL128, Simple }.
static struct tuningModel tuning_model = {
  // hwLat[hw][algorithm][protocol]
  {
    /* NVLINK */
    {
      /* Tree (LL/LL128/Simple) */ { 0.8, 0.0, 2.5 },
      /* Ring (LL/LL128/Simple) */ { 0.8, 0.0, 3.6 },
      /* CollNetDirect (Simple) */ { 0.0, 0.0, 0.8 },
      /* CollNetChain (Simple) */  { 0.0, 0.0, 0.0 },
      /* NVLS */                   { 0, 0, 0 },
      /* NVLS Tree */              { 0, 0, 0 },
    },
    /* PCI */
    {
      /* Tree (LL/LL128/Simple) */ { 2.2, 2.2, 5.7 },
      /* Ring (LL/LL128/Simple) */ { 2.2, 2.2, 5.7 },
      /* CollNetDirect (Simple) */ { 0.0, 0.0, 5.7 },
      /* CollNetChain (Simple) */  { 0.0, 0.0, 5.7 },
      /* NVLS */                   { 0, 0, 0 },
      /* NVLS Tree */              { 0, 0, 0 },
    },
    /* NET */
    {
      /* Tree (LL/LL128/Simple) */ { 12.5, 0.0, 22.4 },
      /* Ring (LL/LL128/Simple) */ { 9.5, 0.0, 19.8 },
      /* CollNetDirect (Simple) */ { 0.0, 0.0, 12.5 },
      /* CollNetChain (Simple) */  { 0.0, 0.0, 0.0 },
      /* NVLS */                   { 0, 0, 0 },
      /* NVLS Tree */              { 0, 0, 0 },
    },
  },

  // bwRatio[i][algorithm][protocol]
  {
    /* 2 nodes (also used for a single node: selected when nNodes <= 2) */
    {
      /* Tree (LL/LL128/Simple) */ { 0.41, 0.00, 1.00 },
      /* Ring (LL/LL128/Simple) */ { 0.41, 0.00, 1.00 },
      /* CollNetDirect (Simple) */ { 0.00, 0.00, 1.00 },
      /* CollNetChain (Simple) */  { 0.00, 0.00, 1.00 },
      /* NVLS */                   { 0, 0, 0 },
      /* NVLS Tree */              { 0, 0, 0 },
    },
    /* more than 2 nodes */
    {
      /* Tree (LL/LL128/Simple) */ { 0.41, 0.00, 0.86 },
      /* Ring (LL/LL128/Simple) */ { 0.41, 0.00, 1.00 },
      /* CollNetDirect (Simple) */ { 0.00, 0.00, 1.00 },
      /* CollNetChain (Simple) */  { 0.00, 0.00, 1.00 },
      /* NVLS */                   { 0, 0, 0 },
      /* NVLS Tree */              { 0, 0, 0 },
    },
  },

  // treeCorrectionFactor[protocol][size bucket]
  {
    /* LL */
    { 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1,
      0.4, 0.5, 1.0, 0.6, 0.4, 0.6, 0.1, 0.3, 0.4,
      0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
    /* LL128 */
    { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
    /* Simple */
    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
      1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 0.2, 0.7, 1.0,
      1.0, 1.0, 0.8, 0.7, 0.7, 0.8, 0.8, 0.8, 0.9, },
  },

  // ringCorrectionFactor[protocol][size bucket]
  {
    /* LL */
    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1,
      0.2, 0.2, 0.1, 0.5, 0.8, 1.0, 0.2, 0.4, 0.5,
      0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, },
    /* LL128 */
    { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, },
    /* Simple */
    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
      0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0,
      0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
  },
};
// Cost tables indexed [collective][algorithm][protocol].
// Written by pluginInit() and read by ncclTopoGetAlgoTime_Tuner(); a bandwidth
// of 0 marks a combination that must not be selected.
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// Estimates how long a collective of `nBytes` takes with the given
// algorithm/protocol, using the tables filled in by pluginInit().
//
//   collType, algorithm, protocol  indices into `bandwidths` / `latencies`
//   numPipeOps                     accepted but not used by this model
//   time                           out: estimated time, or -1.0 when the
//                                  combination has zero bandwidth (disabled)
//   nBytes                         message size in bytes
//
// Always returns ncclSuccess.
ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorithm, int protocol, int numPipeOps, float* time, size_t nBytes) {
  float bw = bandwidths[collType][algorithm][protocol];
  float lat = latencies[collType][algorithm][protocol];

  if (bw == 0) {
    *time = -1.0;
    return ncclSuccess;
  }

  // Size bucket is log2 of the message size in 64-byte units; sizes beyond
  // the table reuse its last entry.
  int logSize = log2i(nBytes >> 6);
  int bucket = (logSize < 27) ? logSize : 26;

  // Only Tree and Ring have per-size correction factors.
  if (algorithm == NCCL_ALGO_TREE) {
    bw *= tuning_model.treeCorrectionFactor[protocol][bucket];
  } else if (algorithm == NCCL_ALGO_RING) {
    bw *= tuning_model.ringCorrectionFactor[protocol][bucket];
  }

  int latCount = 1;
  *time = lat * latCount + (nBytes) / (1000 * bw);
  return ncclSuccess;
}
2025-01-23 11:48:18 -06:00
// Tuner init hook: fills the global `bandwidths` and `latencies` tables for
// every (collective, algorithm, protocol) combination from the constants in
// `tuning_model` and `baseLat`.
//
//   nRanks       total number of ranks
//   nNodes       number of nodes
//   logFunction  logger supplied by the caller; not used by this example
//
// Returns ncclSuccess in all cases. When nRanks <= 1 or nNodes == 0 the
// tables are left untouched.
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
  (void)logFunction;

  // Nothing to model for a single rank. nNodes == 0 is also rejected here
  // because the per-node rank count below divides by nNodes.
  if (nRanks <= 1 || nNodes == 0) return ncclSuccess;

  // The tables are file-scope globals and several combinations are skipped
  // below; clear them so a repeated init cannot leave stale entries behind.
  for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
    for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
        bandwidths[c][a][p] = 0;
        latencies[c][a][p] = 0;
      }
    }
  }

  // Intra-node traffic is always modelled as NVLink; the overall link is the
  // network as soon as more than one node is involved.
  const int intraHw = NCCL_HW_NVLINK;
  const int hw = (nNodes == 1) ? intraHw : NCCL_HW_NET;
  // Ranks per node minus one (unsigned arithmetic, as in the expressions below).
  const size_t intraHops = nRanks / nNodes - 1;

  for (int coll = 0; coll < NCCL_NUM_FUNCTIONS; coll++) {
    int nsteps = coll == ncclFuncAllReduce ? 2 * (nRanks - 1) :
      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks - 1 :
      nRanks;
    int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2 * nNodes : 0) :
      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes - 1 :
      nNodes;

    for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
      // Broadcast/Reduce are modelled for Ring only; ReduceScatter/AllGather
      // for Ring, NVLS and CollNet Direct only.
      if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
      if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;

      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 1) continue;
        if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;

        int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;

        // Fixed per-channel bandwidth (the original note points at
        // graphs[a]->bwIntra / graphs[a]->bwInter as the value it stands in for).
        float bw = 12.0;
        if (a == NCCL_ALGO_NVLS) bw = 0.0;
        if (a == NCCL_ALGO_NVLS_TREE) bw = 0.0;
        if (collnet == 1) bw = 0.0;

        int nChannels = 28;  // original note: nNodes==1 && MI300
        float busBw = nChannels * bw;

        // Various model refinements
        if (nNodes <= 2)
          busBw *= tuning_model.bwRatio[0][a][p];
        else
          busBw *= tuning_model.bwRatio[1][a][p];
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && nNodes == 1) { busBw = busBw * 1.65; }

        // Convert bus BW to algorithm BW
        if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
          float ratio = 1.0f;
          if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
          else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0 / 6.0;
          else ratio *= .5;
          busBw *= ratio;
        }
        bandwidths[coll][a][p] = busBw;

        latencies[coll][a][p] = baseLat[a][p];
        float intraLat = tuning_model.hwLat[intraHw][a][p];
        float interLat = tuning_model.hwLat[NCCL_HW_NET][a][p];

        if (a == NCCL_ALGO_RING) {
          float lat = tuning_model.hwLat[hw][a][p];
          if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
            latencies[coll][a][p] += lat;
          } else {
            // Inter-node rings still have to launch nsteps * net overhead.
            float netOverhead = 0.0;
            if (nNodes > 1) {
              netOverhead = 1;
              if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
            }
            if (intraLat < netOverhead) intraLat = netOverhead;
            latencies[coll][a][p] += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat;
          }
        } else if (a == NCCL_ALGO_TREE) {
          latencies[coll][a][p] +=
            2 * ((intraHops) * intraLat + log2i(nNodes) * interLat);
        } else if (a == NCCL_ALGO_COLLNET_DIRECT) {
          int minimum = 1;
          if ((intraHops) < 1) minimum = (int)(intraHops);
          latencies[coll][a][p] +=
            2 * (minimum * intraLat + (intraHops) * 0.4) + interLat;  // Add 0.4 us arity serialization latency
        } else if (a == NCCL_ALGO_COLLNET_CHAIN) {
          latencies[coll][a][p] += 2 * (intraHops) * intraLat + interLat;
        } else if (a == NCCL_ALGO_NVLS) {
          if (nNodes > 1) latencies[coll][a][p] += tuning_model.hwLat[NCCL_HW_NET][a][p];
        } else if (a == NCCL_ALGO_NVLS_TREE) {
          latencies[coll][a][p] += 2 * (nNodes - 1) * tuning_model.hwLat[NCCL_HW_NET][a][p];
        }
      }
    }
  }

  // Protocols/Algorithms enable/disable.
  // LL128 is forced off in the loop below regardless of its table entry.
  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
  // NVLS and CollNet variants are switched off (original note: "MNNVL: NVLS
  // not yet supported"), leaving Tree and Ring.
  algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
  algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
  algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
  algoEnable[NCCL_ALGO_NVLS] = 0;

  // A zero bandwidth is how a combination is marked as unavailable.
  for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
    for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
        int pEnable = protoEnable[p];
        if (p == NCCL_PROTO_LL128) {
          pEnable = 0;
        }
        if (pEnable == 0) bandwidths[c][a][p] = 0;
        if (algoEnable[a] == 0) bandwidths[c][a][p] = 0;
      }
    }
  }

  return ncclSuccess;
}
2023-09-26 05:47:28 -07:00
2024-03-26 06:08:55 -07:00
// Tuner query hook: picks the (algorithm, protocol) pair with the smallest
// estimated time from ncclTopoGetAlgoTime_Tuner().
//
//   context         unused
//   collType        collective being tuned
//   nBytes          message size in bytes
//   collNetSupport  CollNet algorithms are considered only when this is 1
//   nvlsSupport     NVLS algorithms are considered only when this is 1
//   numPipeOps      forwarded to the time estimate
//   algorithm       out: chosen algorithm, or -1 if no candidate qualified
//   protocol        out: chosen protocol, or -1 if no candidate qualified
//   nChannels       not written by this function
//
// Always returns ncclSuccess.
__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                        int collNetSupport, int nvlsSupport, int numPipeOps,
                                        int* algorithm, int* protocol, int* nChannels) {
  // Upper bound for the search: hopefully no operation takes an hour.
  float bestTime = 3600000000.0;

  *algorithm = -1;
  *protocol = -1;

  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; algo++) {
    const int isCollNet = (algo == NCCL_ALGO_COLLNET_DIRECT || algo == NCCL_ALGO_COLLNET_CHAIN);
    const int isNvls = (algo == NCCL_ALGO_NVLS || algo == NCCL_ALGO_NVLS_TREE);

    if (isCollNet && collNetSupport != 1) continue;
    if (isNvls && nvlsSupport != 1) continue;
    // Plain NVLS additionally requires CollNet support.
    if (algo == NCCL_ALGO_NVLS && collNetSupport != 1) continue;

    for (int proto = 0; proto < NCCL_NUM_PROTOCOLS; proto++) {
      if (proto == NCCL_PROTO_LL128) continue;  // LL128 is never a candidate

      float estimate;
      ncclTopoGetAlgoTime_Tuner(collType, algo, proto, numPipeOps, &estimate, nBytes);

      // A negative estimate marks a disabled combination.
      if (estimate >= 0 && estimate < bestTime) {
        bestTime = estimate;
        *algorithm = algo;
        *protocol = proto;
      }
    }
  }

  return ncclSuccess;
}
2023-09-26 05:47:28 -07:00
2024-03-26 06:08:55 -07:00
// Tuner teardown hook. Does nothing with `context` and reports success.
__hidden ncclResult_t pluginDestroy(void* context) {
  (void)context;
  return ncclSuccess;
}
2023-09-26 05:47:28 -07:00
# define PLUGIN_NAME "Example"
2024-12-18 08:26:06 -08:00
const ncclTuner_v4_t ncclTunerPlugin_v4 = {
2023-09-26 05:47:28 -07:00
. name = PLUGIN_NAME ,
. init = pluginInit ,
. getCollInfo = pluginGetCollInfo ,
. destroy = pluginDestroy
} ;