2
0
Ficheiros
rocm-systems/src/graph/topo.h
T
Sylvain Jeaugey 920dbe5b35 2.8.3-1
Optimization for Tree allreduce on A100.
Improve aggregation performance.
Use shared buffers for inter-node send/recv.
Add NVTX profiling hooks.
Accelerate alltoall connections by merging communication for all
channels.
Add support for one hop communication through NVLink, for faster
send/recv communication on cubemesh topologies like DGX-1.
Improve alltoall scheduling to better balance intra/inter node
communication.
Increase send/recv parallelism by 8x, each warp sending or
receiving to a different peer.
Net: move to v4.
Net: make flush operation asynchronous to accelerate alltoall.
Net: define maximum number of requests.
Fix hang when using LL128 protocol after 2^31 steps.
Fix #379 : topology injection failing when using less GPUs than
described in the XML.
Fix #394 : protocol mismatch causing hangs or crashes when using
one GPU per node.
2020-11-17 11:08:52 -08:00

173 linhas
4.8 KiB
C

/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_TOPO_H_
#define NCCL_TOPO_H_
#include "graph.h"
#include "core.h"
#include <sched.h>
#define LOC_WIDTH 5000.0
#define SM60_NVLINK_WIDTH 18.0
#define SM70_NVLINK_WIDTH 21.0
#define SM80_NVLINK_WIDTH 21.0
#define SM86_NVLINK_WIDTH 12.0
#define PCI_WIDTH 12.0 // PCI Gen3 x16
#define QPI_WIDTH 6.0
#define SKL_QPI_WIDTH 9.0
#define P9_WIDTH 32.0
#define ARM_WIDTH 6.0
#define NET_WIDTH 12.0 // 100Gbit
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P(speed) (speed*9/12)
#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
#define PCI 1
#define NVS 2
#define CPU 3 // Actually NUMA domains
#define NIC 4
#define NET 5
extern const char* topoNodeTypeStr[];
// We want link types and path types to match as much as possible
#define LINK_LOC 0
#define LINK_NVL 1
// Skipping 2 for PATH_NVB
#define LINK_PCI 3
// Skipping 4 for PATH_PXB
// Skipping 5 for PATH_PHB
#define LINK_SYS 6
#define LINK_NET 7
extern const char* topoLinkTypeStr[];
#define PATH_LOC 0
#define PATH_NVL 1
#define PATH_NVB 2
#define PATH_PIX 3
#define PATH_PXB 4
#define PATH_PHB 5
#define PATH_SYS 6
extern const char* topoPathTypeStr[];
struct ncclTopoNode;
struct ncclTopoLink {
int type;
float width;
struct ncclTopoNode* remNode;
};
#define NCCL_TOPO_MAX_LINKS 32
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
struct ncclTopoLinkList {
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
int count;
float width;
int type;
};
#define NCCL_TOPO_CPU_INTEL_BDW 1
#define NCCL_TOPO_CPU_INTEL_SKL 2
#define NCCL_TOPO_UNDEF (-1)
struct ncclTopoNode {
int type;
int64_t id;
// Type specific data
union {
struct {
int dev; // NVML dev number
int rank;
int cudaCompCap;
int gdrSupport;
}gpu;
struct {
uint64_t asic;
int port;
float width;
int gdrSupport;
int collSupport;
int maxChannels;
}net;
struct {
int arch;
int vendor;
int model;
cpu_set_t affinity;
}cpu;
};
int nlinks;
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
// Pre-computed paths to GPUs and NICs
struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
// Used during search
uint64_t used;
};
struct ncclTopoNodeSet {
int count;
struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
};
struct ncclTopoSystem {
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
float maxWidth;
float totalWidth;
};
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr);
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax);
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
*index = -1;
for (int i=0; i<system->nodes[type].count; i++) {
if (system->nodes[type].nodes[i].id == id) {
*index = i;
return ncclSuccess;
}
}
return ncclInternalError;
}
static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) {
*index = -1;
for (int i=0; i<system->nodes[GPU].count; i++) {
if (system->nodes[GPU].nodes[i].gpu.rank == rank) {
*index = i;
return ncclSuccess;
}
}
return ncclInternalError;
}
// Returns NVLink speed in GB/s
static float ncclTopoNVLinkSpeed(int cudaCompCap) {
return
cudaCompCap == 86 ? SM86_NVLINK_WIDTH :
cudaCompCap >= 80 ? SM80_NVLINK_WIDTH :
cudaCompCap >= 70 ? SM70_NVLINK_WIDTH :
cudaCompCap >= 60 ? SM60_NVLINK_WIDTH :
SM80_NVLINK_WIDTH;
}
#endif