920dbe5b35
Optimization for Tree allreduce on A100. Improve aggregation performance. Use shared buffers for inter-node send/recv. Add NVTX profiling hooks. Accelerate alltoall connections by merging communication for all channels. Add support for one hop communication through NVLink, for faster send/recv communication on cubemesh topologies like DGX-1. Improve alltoall scheduling to better balance intra/inter node communication. Increase send/recv parallelism by 8x, each warp sending or receiving to a different peer. Net: move to v4. Net: make flush operation asynchronous to accelerate alltoall. Net: define maximum number of requests. Fix hang when using LL128 protocol after 2^31 steps. Fix #379 : topology injection failing when using less GPUs than described in the XML. Fix #394 : protocol mismatch causing hangs or crashes when using one GPU per node.
173 linhas
4.8 KiB
C
173 linhas
4.8 KiB
C
/*************************************************************************
 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#ifndef NCCL_TOPO_H_
|
|
#define NCCL_TOPO_H_
|
|
|
|
#include "graph.h"
|
|
#include "core.h"
|
|
#include <sched.h>
|
|
|
|
// Link width constants. Units are presumably GB/s, like the NVLink speed
// helper at the end of this header -- TODO confirm for the non-NVLink values.

// Local (same-device) width.
#define LOC_WIDTH         5000.0

// NVLink width per CUDA compute capability family.
#define SM60_NVLINK_WIDTH   18.0
#define SM70_NVLINK_WIDTH   21.0
#define SM80_NVLINK_WIDTH   21.0
#define SM86_NVLINK_WIDTH   12.0

// PCI and network.
#define PCI_WIDTH           12.0  // PCI Gen3 x16
#define NET_WIDTH           12.0  // 100Gbit

// CPU interconnects.
#define QPI_WIDTH            6.0
#define SKL_QPI_WIDTH        9.0
#define P9_WIDTH            32.0
#define ARM_WIDTH            6.0
|
|
|
|
// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
//
// INTEL_P2P(speed)          scales a speed by 9/12.
// INTEL_P2P_OVERHEAD(speed) scales a speed by 12/9 (the inverse factor).
//
// The argument is parenthesized so that a compound expression such as
// INTEL_P2P(a + b) scales the whole value instead of only its last operand.
// The multiply-then-divide order is kept as-is so results for simple
// arguments (including integer arguments) are unchanged.
#define INTEL_P2P(speed) ((speed)*9/12)
#define INTEL_P2P_OVERHEAD(speed) ((speed)*12/9)
|
|
|
|
// Node type identifiers. NCCL_TOPO_NODE_TYPES sizes the per-type arrays in
// this header (node sets in the system, path tables in each node), and the
// individual IDs are used as indices into those arrays.
// NOTE(review): NCCL_TOPO_NODE_TYPES is 7 but only IDs 0..5 are defined
// here -- confirm whether the extra slot is intentionally reserved.
#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
#define PCI 1
#define NVS 2
#define CPU 3  // Actually NUMA domains
#define NIC 4
#define NET 5

// Printable names; presumably indexed by the node type IDs above (defined
// in another translation unit).
extern const char* topoNodeTypeStr[];
|
|
|
|
// Link type identifiers.
// We want link types and path types to match as much as possible, so the
// numbering leaves gaps where only a path type exists.
#define LINK_LOC 0
#define LINK_NVL 1
//      (2 is skipped: PATH_NVB)
#define LINK_PCI 3
//      (4 is skipped: PATH_PXB)
//      (5 is skipped: PATH_PHB)
#define LINK_SYS 6
#define LINK_NET 7

// Printable names; presumably indexed by the LINK_* values above (defined
// in another translation unit).
extern const char* topoLinkTypeStr[];
|
|
|
|
// Path type identifiers, numbered consecutively from 0 to 6.
#define PATH_LOC 0
#define PATH_NVL 1
#define PATH_NVB 2
#define PATH_PIX 3
#define PATH_PXB 4
#define PATH_PHB 5
#define PATH_SYS 6

// Printable names; presumably indexed by the PATH_* values above (defined
// in another translation unit).
extern const char* topoPathTypeStr[];
|
|
|
|
// Forward declaration: a link only stores a pointer to its peer node.
struct ncclTopoNode;

// A single link entry, as stored in a node's links[] array.
struct ncclTopoLink {
  int type;                      // presumably one of the LINK_* values -- TODO confirm
  float width;                   // link width (see the *_WIDTH constants)
  struct ncclTopoNode* remNode;  // the other node this link refers to
};
|
|
// Capacity of the links[] array embedded in each ncclTopoNode.
#define NCCL_TOPO_MAX_LINKS 32
// Capacity of the list[] array in ncclTopoLinkList.
// NCCL_TOPO_MAX_NODES is not defined in this header; it must come from an
// included one.
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
|
|
|
|
struct ncclTopoLinkList {
|
|
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
|
|
int count;
|
|
float width;
|
|
int type;
|
|
};
|
|
|
|
// Intel CPU model identifiers (presumably Broadwell / Skylake, going by the
// BDW / SKL suffixes -- TODO confirm).
#define NCCL_TOPO_CPU_INTEL_BDW 1
#define NCCL_TOPO_CPU_INTEL_SKL 2

// Sentinel for "not defined".
#define NCCL_TOPO_UNDEF (-1)
|
|
|
|
struct ncclTopoNode {
|
|
int type;
|
|
int64_t id;
|
|
// Type specific data
|
|
union {
|
|
struct {
|
|
int dev; // NVML dev number
|
|
int rank;
|
|
int cudaCompCap;
|
|
int gdrSupport;
|
|
}gpu;
|
|
struct {
|
|
uint64_t asic;
|
|
int port;
|
|
float width;
|
|
int gdrSupport;
|
|
int collSupport;
|
|
int maxChannels;
|
|
}net;
|
|
struct {
|
|
int arch;
|
|
int vendor;
|
|
int model;
|
|
cpu_set_t affinity;
|
|
}cpu;
|
|
};
|
|
int nlinks;
|
|
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
|
|
// Pre-computed paths to GPUs and NICs
|
|
struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
|
|
// Used during search
|
|
uint64_t used;
|
|
};
|
|
|
|
struct ncclTopoNodeSet {
|
|
int count;
|
|
struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
|
|
};
|
|
|
|
struct ncclTopoSystem {
|
|
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
|
|
float maxWidth;
|
|
float totalWidth;
|
|
};
|
|
|
|
// Functions declared here are defined elsewhere; all of them report status
// through an ncclResult_t return value.

// Node and link management.
// NOTE(review): the id parameter is uint64_t for Get/Create and int for
// Remove, while ncclTopoNode::id is int64_t -- confirm the narrowing in
// ncclTopoRemoveNode is intended.
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);

// Debug output and loading from an XML topology file.
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);

// Network lookup for a rank; the result is written to *id.
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr);

// Conversions between XML and topology systems / graphs.
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);

// Compute capability query; results are written to *ccMin and *ccMax.
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
|
|
|
|
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
|
|
*index = -1;
|
|
for (int i=0; i<system->nodes[type].count; i++) {
|
|
if (system->nodes[type].nodes[i].id == id) {
|
|
*index = i;
|
|
return ncclSuccess;
|
|
}
|
|
}
|
|
return ncclInternalError;
|
|
}
|
|
|
|
static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) {
|
|
*index = -1;
|
|
for (int i=0; i<system->nodes[GPU].count; i++) {
|
|
if (system->nodes[GPU].nodes[i].gpu.rank == rank) {
|
|
*index = i;
|
|
return ncclSuccess;
|
|
}
|
|
}
|
|
return ncclInternalError;
|
|
}
|
|
|
|
// Returns NVLink speed in GB/s
|
|
static float ncclTopoNVLinkSpeed(int cudaCompCap) {
|
|
return
|
|
cudaCompCap == 86 ? SM86_NVLINK_WIDTH :
|
|
cudaCompCap >= 80 ? SM80_NVLINK_WIDTH :
|
|
cudaCompCap >= 70 ? SM70_NVLINK_WIDTH :
|
|
cudaCompCap >= 60 ? SM60_NVLINK_WIDTH :
|
|
SM80_NVLINK_WIDTH;
|
|
}
|
|
#endif
|