/*
 * Listing residue kept from the page this file was copied from.
 * Commit: 3ea7eedf3b
 * Commit message: Improvements for GB200 systems * Optimize the network performance by alternating the direction of the rings and the NIC to GPU assignment across communicators to limit unnecessary sharing. * Fix the detection of C2C links in case GPU Direct RDMA is disabled between a GPU and a NIC. * Fix PXN support on MNNVL systems, where NCCL would try (and fail) to share regular host memory across multiple nodes. * Fix P2C (PXN over C2C), which is now preferred over regular PXN. This support is currently preliminary and is disabled by default; use NCCL_PXN_C2C=1 to enable. Further reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs. Optimize the network performance on DGX B200 systems by adjusting the bandwidths provided to the graph search algorithm. Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8. Restore the plugin name handling logic to make it possible to specify a path to the plugin (Issue #1732). Restore the ability to change NCCL_COLLNET_ENABLE during execution (Issue #1741). Add an example tuner plugin with CSV-based overrides. Remove an x86 dependency from the example profiler.
 * File info: 579 lines, 27 KiB, C++
 */
/*************************************************************************
|
|
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "core.h"
|
|
#include "device.h"
|
|
#include "comm.h"
|
|
#include "topo.h"
|
|
|
|
// User overrides for the maximum number of threads. The default of -2 is not
// a positive value, so getNthreads() falls back to its built-in default.
NCCL_PARAM(Nthreads, "NTHREADS", -2);
NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
|
|
|
|
static int getNthreads(const char* name, int env, int min, int max, int def) {
|
|
int nt = env;
|
|
if (nt > 0) {
|
|
if (nt % WARP_SIZE != 0) {
|
|
INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE);
|
|
nt = max;
|
|
} else if (nt > max) {
|
|
INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (maximum %d).", name, nt, max);
|
|
nt = max;
|
|
} else if (nt < min) {
|
|
INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (minimum %d).", name, nt, min);
|
|
nt = min;
|
|
}
|
|
} else {
|
|
nt = def;
|
|
}
|
|
return nt;
|
|
}
|
|
|
|
// Parse a map of prefixes to a list of elements. The first prefix is
|
|
// optional and, if not present, the list of elements will be applied
|
|
// to all prefixes. Only the first list of elements can lack a
|
|
// prefix. Prefixes (if present) are followed by a colon. Lists of
|
|
// elements are comma delimited. Mappings of prefix to the lists of
|
|
// elements are semi-colon delimited.
|
|
//
|
|
// For example:
|
|
//
|
|
// NCCL_ALGO="ring,collnetdirect;allreduce:tree,collnetdirect;broadcast:ring"
|
|
// Enable ring and collnetdirect for all functions, then select tree
|
|
// and collnetdirect for allreduce and ring for broadcast.
|
|
//
|
|
// NCCL_PROTO="LL,Simple;allreduce:^LL"
|
|
// Enable LL and Simple for all functions, but everything except LL
|
|
// for allreduce.
|
|
//
|
|
// NCCL_PROTO="^LL128;allreduce:LL128"
|
|
// Enable everything but LL128, but only LL128 for allreduce.
|
|
ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes, const char* elems[], int nelems, int* list) {
|
|
ncclResult_t ret = ncclSuccess;
|
|
char* fullStr = strdup(str);
|
|
char* tmpFullStr;
|
|
char* fullToken = strtok_r(fullStr, ";", &tmpFullStr);
|
|
char* subToken = nullptr;
|
|
char* tokStr = nullptr;
|
|
while (fullToken) {
|
|
subToken = strdup(fullToken);
|
|
char* tmpSubStr;
|
|
char* prefix = strtok_r(subToken, ":", &tmpSubStr);
|
|
char* elemList = strtok_r(NULL, ":", &tmpSubStr);
|
|
if (elemList == NULL) {
|
|
if (fullToken != fullStr) {
|
|
// It makes no sense for any entry other than the first to not have a prefix,
|
|
// because then all the prefixes before the prefix-less entry would be
|
|
// overwritten.
|
|
WARN("All entries except the first must have a prefix: \"%s\"", str);
|
|
ret = ncclInvalidUsage;
|
|
goto fail;
|
|
}
|
|
elemList = prefix;
|
|
prefix = NULL;
|
|
}
|
|
|
|
int unset, set;
|
|
if (elemList[0] == '^') {
|
|
unset = 1; set = 0; elemList++;
|
|
} else {
|
|
unset = 0; set = 1;
|
|
}
|
|
|
|
bool foundPrefix = false;
|
|
for (int p=0; p<nprefixes; p++) {
|
|
if (prefix && strcasecmp(prefix, prefixElems[p]) != 0) continue;
|
|
foundPrefix = true;
|
|
for (int e=0; e<nelems; e++) list[p*nelems+e] = unset;
|
|
|
|
tokStr = strdup(elemList);
|
|
char* tmpStr;
|
|
char* elem = strtok_r(tokStr, ",", &tmpStr);
|
|
while (elem) {
|
|
int e;
|
|
for (e=0; e<nelems; e++) {
|
|
if (strcasecmp(elem, elems[e]) == 0) {
|
|
list[p*nelems+e] = set;
|
|
break;
|
|
}
|
|
}
|
|
if (e==nelems) {
|
|
WARN("Unrecognized element token \"%s\" when parsing \"%s\"", elem, str);
|
|
ret = ncclInvalidUsage;
|
|
goto fail;
|
|
}
|
|
elem = strtok_r(NULL, ",", &tmpStr);
|
|
}
|
|
free(tokStr);
|
|
tokStr = nullptr;
|
|
}
|
|
if (!foundPrefix) {
|
|
WARN("Unrecognized prefix token \"%s\" when parsing \"%s\"", prefix, str);
|
|
ret = ncclInvalidUsage;
|
|
goto fail;
|
|
}
|
|
free(subToken);
|
|
subToken = nullptr;
|
|
|
|
fullToken = strtok_r(NULL, ";", &tmpFullStr);
|
|
}
|
|
|
|
exit:
|
|
free(tokStr);
|
|
free(subToken);
|
|
free(fullStr);
|
|
return ret;
|
|
fail:
|
|
goto exit;
|
|
}
|
|
|
|
// Latencies in us, Bandwidths in GB/s
|
|
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
|
|
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
|
|
{ 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
|
|
{ 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain
|
|
{ 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree
|
|
|
|
// NVLink, PCI, Network
|
|
#define NCCL_HW_NVLINK 0
|
|
#define NCCL_HW_PCI 1
|
|
#define NCCL_HW_NET 2
|
|
static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
|
{ /* NVLINK */
|
|
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4.0 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
|
|
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
|
|
/* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } },
|
|
/* PCI */
|
|
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 4.0 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
|
|
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
|
|
/* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
|
|
/* NET */
|
|
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
|
|
/* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 },
|
|
/* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } }
|
|
};
|
|
|
|
/* Array indexes used below */
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2
#define BLACKWELL_COMPCAP_IDX 3

// In all the tables below, the column is selected by the number of nodes:
// 1 node (N1), 2 nodes (N2), more than 2 nodes (N4).

// LL max bandwidth. ncclTopoTuneModel() uses it to cap the whole bus
// bandwidth of the LL protocol (it is not multiplied by the number of
// channels). The row is the GPU generation on a single node, and the CPU
// vendor (row 0: Intel and others, row 1: AMD or mixed) on multiple nodes.
static const double llMaxBws[][3] = {
  /* Volta-N1/Intel-N2/Intel-N4 */ {39.0, 39.0, 20.4},
  /* Ampere-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
  /* Hopper-N1/AMD-N2/AMD-N4 */ {141.0, 45.0 /*avg of ring & tree*/, 35.0},
  /* Blackwell-N1/AMD-N2/AMD-N4 */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0},
};

// Ring LL128 max bandwidth per channel, one row per GPU generation.
static const double perChMaxRingLL128Bws[][3] = {
  /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
  /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
  /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
  /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7},
};
// Tree LL128 max bandwidth per channel, one row per GPU generation.
static const double perChMaxTreeLL128Bws[][3] = {
  /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
  /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
  /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
  /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0},
};
// Tree max bandwidth per channel (applied to allreduce), one row per GPU
// generation.
static const double perChMaxTreeBws[][3] = {
  /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
  /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
  /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
  /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
};
|
|
|
|
NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
|
|
static int ncclPatEnable(struct ncclComm* comm) {
|
|
int patEnable = ncclParamPatEnable();
|
|
if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics
|
|
if (patEnable != 2) return patEnable;
|
|
if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node
|
|
if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload
|
|
return 1;
|
|
}
|
|
|
|
// Network post overhead in ns (1000 = 1 us)
|
|
NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
|
|
|
|
static float getNetOverhead(struct ncclComm* comm) {
|
|
if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001;
|
|
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
|
|
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
|
|
return 1.0;
|
|
}
|
|
|
|
NCCL_PARAM(Ll128C2c, "LL128_C2C", 1);
|
|
|
|
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
|
|
int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
|
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
|
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
|
|
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
|
|
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
|
|
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
|
|
comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
|
|
comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] =
|
|
comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS;
|
|
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
|
|
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
|
|
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
|
|
getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
|
|
|
|
int nNodes = comm->nNodes;
|
|
int nRanks = comm->nRanks;
|
|
if (nRanks <= 1) return ncclSuccess;
|
|
|
|
int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
|
|
int index2 = nNodes <= 2 ? nNodes-1 : 2;
|
|
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
|
|
int index1 = nNodes == 1 ? compCapIndex :
|
|
(comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0;
|
|
double llMaxBw = llMaxBws[index1][index2];
|
|
double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
|
|
double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
|
|
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
|
|
// De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
|
|
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
|
|
float ppn = (float)nRanks / nNodes;
|
|
|
|
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
|
|
|
|
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
|
|
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
|
|
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
|
|
nRanks;
|
|
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
|
if ((coll == ncclFuncBroadcast || coll == ncclFuncReduce) && a != NCCL_ALGO_RING) continue;
|
|
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
|
|
&& a != NCCL_ALGO_PAT && a != NCCL_ALGO_RING
|
|
&& a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
|
if (coll == ncclFuncAllReduce && a == NCCL_ALGO_PAT) continue;
|
|
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
|
|
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
|
|
&& a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
|
|
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
|
|
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
|
|
if (a == NCCL_ALGO_NVLS) {
|
|
if (coll == ncclFuncAllReduce) {
|
|
bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
|
|
} else {
|
|
// allgather and reducescatter
|
|
bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f);
|
|
}
|
|
}
|
|
if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);
|
|
float busBw = graphs[a]->nChannels * bw;
|
|
|
|
// Various model refinements
|
|
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
|
|
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
|
|
if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
|
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
|
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
|
|
if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
|
|
if (a == NCCL_ALGO_PAT) busBw *= .75;
|
|
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
|
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
|
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
|
|
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
|
|
busBw = ppn * std::min(graphs[a]->bwIntra, graphs[a]->bwInter * 0.9f);
|
|
} else {
|
|
// Collnet+Direct requires all GPUs to have a local NIC to work at full speed
|
|
float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
|
|
factor -= (factor-1)/2;
|
|
busBw /= factor;
|
|
if (minCompCap >= 90) busBw *= .85;
|
|
}
|
|
}
|
|
// disable collnet for allgather/reducescatter if #localranks > #heads
|
|
// AllGather/ReduceScatter requires 1:1 GPU:NIC
|
|
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_COLLNET_DIRECT) && p == NCCL_PROTO_SIMPLE && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) && comm->nNodes > 1) {
|
|
int nHeads = 0;
|
|
if (coll == ncclFuncAllGather && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->iallgather)) busBw = 0.0f;
|
|
if (coll == ncclFuncReduceScatter && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter)) busBw = 0.0f;
|
|
if (comm->config.collnetEnable)
|
|
nHeads = comm->collNetHeadsNum;
|
|
else
|
|
busBw = 0.0f;
|
|
if (busBw > 0.0f) {
|
|
for (int r = 0; r < comm->nRanks; r++) {
|
|
int node = comm->rankToNode[r];
|
|
if (comm->nodeRanks[node].localRanks > nHeads) {
|
|
busBw = 0.0f;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert bus BW to algorithm BW
|
|
if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
|
|
float ratio = 1.0f;
|
|
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
|
|
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
|
|
else ratio *= .5;
|
|
busBw *= ratio;
|
|
}
|
|
comm->bandwidths[coll][a][p] = busBw;
|
|
comm->latencies[coll][a][p] = baseLat[a][p];
|
|
float intraLat = hwLat[intraHw[a]][a][p];
|
|
// With ppn=1 latencies are fully exposed, use the Tree network latency
|
|
float interLat = ppn == 1 ? hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : hwLat[NCCL_HW_NET][a][p];
|
|
interLat += graphs[a]->latencyInter;
|
|
// Also add the flush extra latency
|
|
if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
|
|
|
|
if (a == NCCL_ALGO_RING) {
|
|
float lat = hwLat[hw[a]][a][p];
|
|
if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
|
|
if (graphs[a]->sameChannels) {
|
|
comm->latencies[coll][a][p] += lat;
|
|
} else {
|
|
if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
|
|
comm->latencies[coll][a][p] += nsteps*lat;
|
|
}
|
|
} else {
|
|
// Inter-node rings still have to launch nsteps * net overhead.
|
|
float netOverhead = 0.0;
|
|
if (nNodes > 1) {
|
|
netOverhead = getNetOverhead(comm);
|
|
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
|
|
}
|
|
intraLat = std::max(intraLat, netOverhead);
|
|
int nInterSteps = nNodes == 1 ? 0 : coll == ncclFuncAllReduce ? 2*(nNodes-1) : nNodes-1;
|
|
comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
|
|
}
|
|
} else if (a == NCCL_ALGO_TREE) {
|
|
if (coll == ncclFuncAllReduce) {
|
|
comm->latencies[coll][a][p] +=
|
|
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
|
|
}
|
|
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
|
|
comm->latencies[coll][a][p] +=
|
|
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency
|
|
} else if (a == NCCL_ALGO_COLLNET_CHAIN) {
|
|
comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
|
|
} else if (a == NCCL_ALGO_NVLS) {
|
|
comm->latencies[coll][a][p] = intraLat;
|
|
if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
|
|
} else if (a == NCCL_ALGO_NVLS_TREE) {
|
|
comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
|
|
} else if (a == NCCL_ALGO_PAT) {
|
|
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
|
|
comm->latencies[coll][a][p] = 8 // Base time
|
|
+ log2i(nNodes) * (interLat/3.5) // Log latency
|
|
+ nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Protocols/Algorithms enable/disable, and user overrides.
|
|
// All are enabled except ll128 which is enabled by default only in certain cases.
|
|
int protoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_PROTOCOLS];
|
|
int algoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS];
|
|
for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
protoEnable[f*NCCL_NUM_PROTOCOLS+p] = p == NCCL_PROTO_LL128 ? 2 : 1;
|
|
}
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
|
algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 1;
|
|
}
|
|
}
|
|
|
|
const char *protoStr = ncclGetEnv("NCCL_PROTO");
|
|
if (protoStr) {
|
|
INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
|
|
NCCLCHECK(parseList(protoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
|
|
}
|
|
const char *algoStr = ncclGetEnv("NCCL_ALGO");
|
|
if (algoStr) {
|
|
INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
|
|
NCCLCHECK(parseList(algoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
|
|
}
|
|
|
|
if (comm->rank == 0 && (algoStr||protoStr)) {
|
|
constexpr int strLength = 1024;
|
|
char funcAlgoProtoTuningStr[strLength];
|
|
int offset = 0;
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n Function | ");
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8s ", ncclProtoStr[p]);
|
|
}
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s ", ncclAlgoStr[a]);
|
|
}
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
|
|
|
|
for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s | ", ncclFuncStr[f]);
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8d ", protoEnable[f*NCCL_NUM_PROTOCOLS+p]);
|
|
}
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13d ", algoEnable[f*NCCL_NUM_ALGORITHMS+a]);
|
|
}
|
|
offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
|
|
}
|
|
|
|
INFO(NCCL_ENV, "Enabled NCCL Func/Proto/Algo Matrix:%s", funcAlgoProtoTuningStr);
|
|
}
|
|
|
|
int nvsCount = 0;
|
|
NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
|
|
|
|
for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
|
int disable = 0;
|
|
// Disable NVLS Tree on a single node
|
|
if (comm->nNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1;
|
|
// Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported.
|
|
if (comm->config.collnetEnable == 0 &&
|
|
(a == NCCL_ALGO_COLLNET_DIRECT ||
|
|
a == NCCL_ALGO_COLLNET_CHAIN ||
|
|
(a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1;
|
|
// Disable CollNet+Direct if not on an NVSwitch system
|
|
if (nvsCount == 0 && a == NCCL_ALGO_COLLNET_DIRECT) disable = 1;
|
|
if (disable) algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 0;
|
|
}
|
|
}
|
|
|
|
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
int pEnable = protoEnable[c*NCCL_NUM_PROTOCOLS+p];
|
|
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
|
|
pEnable = 1;
|
|
if (ncclParamLl128C2c() && minCompCap >= 90) {
|
|
// Enable LL128 by default only on Hopper/Blackwell for all connections up to P2C and PXN.
|
|
pEnable &= (graphs[a]->typeInter <= PATH_PXN);
|
|
} else {
|
|
// Enable LL128 only up to PXB. Don't enable LL128 over PxN because PxN can encapsulate PxB or P2C links.
|
|
pEnable &= (graphs[a]->typeInter <= PATH_PXB);
|
|
if (!ncclParamLl128C2c() && minCompCap >= 90)
|
|
INFO(NCCL_GRAPH, "Disabling LL128 over all PxN connections (PXB and C2C). This ensures that no C2C link will be used by LL128.");
|
|
}
|
|
pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
|
|
pEnable &= (minCompCap == maxCompCap);
|
|
pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2));
|
|
}
|
|
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
|
if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0;
|
|
}
|
|
|
|
if (comm->rank == 0) {
|
|
constexpr int lineLen = 1024;
|
|
char line[lineLen];
|
|
int offset = 0;
|
|
for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
|
|
offset = snprintf(line, lineLen, " Algorithm |");
|
|
for (int ba=0; ba<3; ba++) {
|
|
int a = block*3+ba;
|
|
if (a >= NCCL_NUM_ALGORITHMS) continue;
|
|
offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s %14s %14s |", "", ncclAlgoStr[a], "");
|
|
}
|
|
INFO(NCCL_TUNING, "%s", line);
|
|
offset = snprintf(line, lineLen, " Protocol |");
|
|
for (int ba=0; ba<3; ba++) {
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s |", ncclProtoStr[p]);
|
|
}
|
|
}
|
|
INFO(NCCL_TUNING, "%s", line);
|
|
offset = snprintf(line, lineLen, " Max NThreads |");
|
|
for (int ba=0; ba<3; ba++) {
|
|
int a = block*3+ba;
|
|
if (a >= NCCL_NUM_ALGORITHMS) continue;
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14d |", comm->maxThreads[a][p]);
|
|
}
|
|
}
|
|
INFO(NCCL_TUNING, "%s", line);
|
|
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
|
|
offset = snprintf(line, lineLen, "%13s |", ncclFuncStr[c]);
|
|
for (int ba=0; ba<3; ba++) {
|
|
int a = block*3+ba;
|
|
if (a >= NCCL_NUM_ALGORITHMS) continue;
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
offset += snprintf(line+offset, std::max(0, lineLen-offset), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
|
|
}
|
|
}
|
|
INFO(NCCL_TUNING, "%s", line);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Set per-thread amount of work before we increase nThreads and nChannels
|
|
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
|
comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
|
|
comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
|
|
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
|
|
}
|
|
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
|
|
comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 512;
|
|
comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512;
|
|
|
|
// Override defaults with user env
|
|
const char* str = ncclGetEnv("NCCL_THREAD_THRESHOLDS");
|
|
if (str) {
|
|
INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
|
|
ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
|
|
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
|
|
for (int a=0; a<2; a++) {
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
|
|
}
|
|
}
|
|
}
|
|
|
|
INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
|
|
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
|
|
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
|
|
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
|
|
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
|
|
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
|
|
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE],
|
|
comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE],
|
|
comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]);
|
|
return ncclSuccess;
|
|
}
|
|
|
|
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
|
|
// factor is not ideal but works quite well. Powers of two, 64 B to 256MB.
|
|
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
|
|
{ 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0, 1.0 },
|
|
{ 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .6, .6, .6, .8, .9, .9, .9, .9, 1.0, 1.0 },
|
|
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 }
|
|
};
|
|
|
|
// Estimate the time of a collective from the model stored in the
// communicator.
//   comm       : communicator holding the bandwidths and latencies tables
//   coll       : function index
//   algorithm  : algorithm index
//   protocol   : protocol index
//   nBytes     : size of the operation in bytes
//   numPipeOps : number of operations aggregated together
//   time       : output; estimated time, or -1.0 when the combination has a
//                bandwidth of 0 (i.e. it is disabled or not modeled)
// Always returns ncclSuccess.
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) {
  float bw = comm->bandwidths[coll][algorithm][protocol];
  float lat = comm->latencies[coll][algorithm][protocol];

  if (bw == 0) {
    *time = -1.0; return ncclSuccess;
  }
  // Index into treeCorrectionFactor: log2 of the size in units of 64 bytes.
  int logSize = log2i(nBytes>>6);
  if (algorithm == NCCL_ALGO_TREE && coll == ncclFuncAllReduce && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
  if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
      && coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
    lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
  }
  // Tree pipelining saves latency in aggregation cases
  int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS);
  // The latency is paid latCount times; the transfer time is size / bandwidth.
  *time = lat * latCount + nBytes / (1000 * bw);
  return ncclSuccess;
}
|