Merge pull request #172 from wenkaidu/topo_expl
Add topology explorer
[ROCm/rccl commit: 5b3856f2ed]
Этот коммит содержится в:
@@ -30,9 +30,15 @@ const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "QPI", "NET" };
|
||||
// Printable names for the topology link types (presumably indexed by the
// LINK_* constants such as LINK_NVL -- TODO confirm against topo.h).
const char *topoLinkTypeStr[] = {
  "LOC",
  "NVL",
  "PCI",
  "QPI",
  "NET"
};
|
||||
#endif
|
||||
|
||||
#ifdef TOPO_EXPL
|
||||
#include "model.h"
|
||||
extern NodeModel *node_model;
|
||||
#endif
|
||||
|
||||
/******************************************************************/
|
||||
/******************* Graph Creation Functions *********************/
|
||||
/******************************************************************/
|
||||
#ifndef TOPO_EXPL
|
||||
static int getNumaId(char *path) {
|
||||
char npath[PATH_MAX];
|
||||
snprintf(npath, PATH_MAX, "%s/numa_node", path);
|
||||
@@ -59,6 +65,15 @@ static ncclResult_t getPciPath(char* busId, char** path) {
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
#else
|
||||
static int getNumaId(char *path) {
|
||||
return node_model->getNumaId(path);
|
||||
}
|
||||
|
||||
// Topology-explorer build of getPciPath(): resolves a GPU bus id string to
// its PCI path by asking the global node model. `*path` is written by the
// model and the model's result code is returned unchanged.
static ncclResult_t getPciPath(char* busId, char** path) {
  const ncclResult_t status = node_model->getGpuPciPath(busId, path);
  return status;
}
|
||||
#endif
|
||||
|
||||
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
|
||||
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
|
||||
@@ -102,6 +117,7 @@ int interCpuWidth = 0;
|
||||
int cpuPciWidth = 0;
|
||||
int p2pPciWidth = 0;
|
||||
|
||||
#ifndef TOPO_EXPL
|
||||
static ncclResult_t getCpuWidths() {
|
||||
// Check if already detected
|
||||
if (interCpuWidth + cpuPciWidth + p2pPciWidth) return ncclSuccess;
|
||||
@@ -182,6 +198,14 @@ static ncclResult_t getCpuWidths() {
|
||||
INFO(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth);
|
||||
return ncclSuccess;
|
||||
}
|
||||
#else
|
||||
// Topology-explorer build of getCpuWidths(): fills the file-level
// interCpuWidth / cpuPciWidth / p2pPciWidth from the global node model.
// Unlike the regular build there is no "already detected" short-circuit, so
// the widths are re-read from the model on every call.
// Returns ncclSuccess, or the model's error code if the lookup fails.
static ncclResult_t getCpuWidths() {
  // Start with an empty name so the trace below never prints garbage.
  char cpu[256] = "";
  // Propagate a model failure instead of silently dropping its result.
  NCCLCHECK(node_model->getCpuWidths(cpu, &interCpuWidth, &cpuPciWidth, &p2pPciWidth));
  TRACE(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth);
  return ncclSuccess;
}
|
||||
#endif
|
||||
|
||||
static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
|
||||
NCCLCHECK(getCpuWidths());
|
||||
@@ -272,7 +296,11 @@ ncclResult_t ncclTopoConnectXGMI(struct ncclComm* comm, struct ncclTopoSystem* s
|
||||
uint32_t link_type, hops;
|
||||
int cudaDev1 = busIdToCudaDev(comm->peerInfo[gpu1->rank].busId);
|
||||
int cudaDev2 = busIdToCudaDev(comm->peerInfo[gpu2->rank].busId);
|
||||
#ifndef TOPO_EXPL
|
||||
if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) {
|
||||
#else
|
||||
if (node_model->getLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) {
|
||||
#endif
|
||||
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) {
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu2, LINK_NVL, minWidth));
|
||||
}
|
||||
@@ -424,6 +452,7 @@ ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTop
|
||||
// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
|
||||
#include <glob.h>
|
||||
#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
|
||||
#ifndef TOPO_EXPL
|
||||
uint64_t getIbGuid(char* path) {
|
||||
uint64_t guid = 0ULL;
|
||||
char guidPath[PATH_MAX];
|
||||
@@ -446,6 +475,11 @@ uint64_t getIbGuid(char* path) {
|
||||
}
|
||||
return guid;
|
||||
}
|
||||
#else
|
||||
uint64_t getIbGuid(char* path) {
|
||||
return node_model->getIbGuid(path);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct netInfo {
|
||||
char* path;
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Builds the stand-alone topology explorer (topo_expl) from its own sources
# plus the graph sources under ../../src/graph, compiled with -DTOPO_EXPL.

# Use /opt/rocm/hip when that directory exists ($(wildcard) expands to an
# empty string otherwise), else fall back to a path relative to this directory.
HIP_PATH ?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
  HIP_PATH = ../../..
endif
HIPCC = $(HIP_PATH)/bin/hipcc

EXE = topo_expl
CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE

files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/graph/search.cc ../../src/graph/connect.cc

# 'all' and 'clean' are commands, not files: keep them working even if a file
# with the same name appears in this directory.
.PHONY: all clean

all: $(EXE)

# Single-step compile and link of every source file.
$(EXE): $(files)
	$(HIPCC) $(CXXFLAGS) $^ -o $@

clean:
	rm -f *.o $(EXE)
|
||||
@@ -0,0 +1,419 @@
|
||||
/*
|
||||
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef MODEL_H_
|
||||
#define MODEL_H_
|
||||
|
||||
class CpuDevices {
|
||||
private:
|
||||
char *cpuName;
|
||||
int interCpuWidth;
|
||||
int cpuPciWidth;
|
||||
int p2pPciWidth;
|
||||
|
||||
public:
|
||||
CpuDevices(const char *cpuname, const int intercpuwidth, const int cpupciwidth, const int p2ppciwidth) :
|
||||
cpuName((char *)cpuname), interCpuWidth(intercpuwidth), cpuPciWidth(cpupciwidth), p2pPciWidth(p2ppciwidth) {}
|
||||
|
||||
CpuDevices() : cpuName(0), interCpuWidth(0), cpuPciWidth(0), p2pPciWidth(0) {}
|
||||
|
||||
ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
|
||||
strcpy(name, cpuName);
|
||||
*interCpu = interCpuWidth;
|
||||
*cpuPci = cpuPciWidth;
|
||||
*p2pPci = p2pPciWidth;
|
||||
return ncclSuccess;
|
||||
}
|
||||
};
|
||||
|
||||
class GpuDevices {
|
||||
private:
|
||||
int nGpus;
|
||||
uint64_t *busIds;
|
||||
char **gpuPciPaths;
|
||||
int *gpuNumaIds;
|
||||
int *connMatrix;
|
||||
|
||||
public:
|
||||
GpuDevices(const int ngpus, const uint64_t *busids, const char **gpupcipaths, const int *gpunumaids, const int *connmatrix) :
|
||||
nGpus(ngpus), busIds((uint64_t *)busids), gpuPciPaths((char **)gpupcipaths), gpuNumaIds((int *)gpunumaids), connMatrix((int *)connmatrix) {}
|
||||
|
||||
GpuDevices () : nGpus(0), busIds(0), gpuPciPaths(0), gpuNumaIds(0), connMatrix(0) {}
|
||||
|
||||
int getnDevs() { return nGpus; }
|
||||
|
||||
uint64_t getBusId(int dev) { return busIds[dev]; }
|
||||
|
||||
ncclResult_t getPciPath(char* busId, char** path) {
|
||||
char tempBusId[] = "0000:00:00.0";
|
||||
*path = (char *)malloc(PATH_MAX);
|
||||
int i;
|
||||
for (i = 0; i < nGpus; i++) {
|
||||
NCCLCHECK(int64ToBusId(busIds[i], tempBusId));
|
||||
if (strcmp(busId, tempBusId) == 0)
|
||||
break;
|
||||
}
|
||||
if (i < nGpus)
|
||||
strcpy(*path, gpuPciPaths[i]);
|
||||
else {
|
||||
WARN("Could not find real path of %s", busId);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int p2pCanConnect(int device1, int device2) {
|
||||
// connection matrix are 8 GPUs
|
||||
int dist = connMatrix[device1*8+device2];
|
||||
if (dist == 255)
|
||||
return 0;
|
||||
//if (dist%15 == 0 && dist/15 != 1) {
|
||||
// return 0;
|
||||
//}
|
||||
return 1;
|
||||
};
|
||||
|
||||
hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
|
||||
// connection matrix are 8 GPUs
|
||||
int dist = connMatrix[device1*8+device2];
|
||||
|
||||
if (dist%15 == 0) {
|
||||
*linktype = 4;
|
||||
*hopcount = dist/15;
|
||||
}
|
||||
else if (dist%20 == 0) {
|
||||
*linktype = 2;
|
||||
*hopcount = dist/20;
|
||||
}
|
||||
else if (dist%36 == 0) {
|
||||
*linktype = 1;
|
||||
*hopcount = dist/36;
|
||||
}
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
virtual int getNumaId(char *path) {
|
||||
int n;
|
||||
// search for all GPUs
|
||||
for (n = 0; n < nGpus; n++)
|
||||
if (strcmp(path, gpuPciPaths[n]) == 0)
|
||||
break;
|
||||
if (n < nGpus)
|
||||
return gpuNumaIds[n];
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
class NetDevices {
|
||||
private:
|
||||
int nNetDevs;
|
||||
char **netPciPaths;
|
||||
uint64_t *netGuids; // IB ports on same card share the same GUID
|
||||
int *netNumaIds;
|
||||
|
||||
public:
|
||||
NetDevices(const int nnetdevs, const char **netpcipaths, const uint64_t *netguids, const int *netnumaids) :
|
||||
nNetDevs(nnetdevs), netPciPaths((char **)netpcipaths), netGuids((uint64_t *)netguids), netNumaIds((int *)netnumaids) {}
|
||||
|
||||
NetDevices() : nNetDevs(0), netPciPaths(0), netGuids(0), netNumaIds(0) {}
|
||||
|
||||
int getnDevs() { return nNetDevs; }
|
||||
|
||||
ncclResult_t getPciPath(int dev, char** path) {
|
||||
*path = (char *)malloc(PATH_MAX);
|
||||
if (dev < nNetDevs)
|
||||
strcpy(*path, netPciPaths[dev]);
|
||||
else {
|
||||
WARN("Could not find real path of %d", dev);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
virtual int getNumaId(char *path) {
|
||||
int n;
|
||||
// search for all NICs
|
||||
for (n = 0; n < nNetDevs; n++)
|
||||
if (strcmp(path, netPciPaths[n]) == 0)
|
||||
break;
|
||||
if (n < nNetDevs)
|
||||
return netNumaIds[n];
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint64_t getIbGuid(char* path) {
|
||||
int n;
|
||||
for (n = 0; n < nNetDevs; n++)
|
||||
if (strcmp(path, netPciPaths[n]) == 0)
|
||||
break;
|
||||
if (n < nNetDevs)
|
||||
return netGuids[n];
|
||||
WARN("Invalid IB path %s", path);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
class NodeModel {
|
||||
private:
|
||||
CpuDevices cpus;
|
||||
GpuDevices gpus;
|
||||
NetDevices netdevs;
|
||||
|
||||
public:
|
||||
int nodeId;
|
||||
int currRank;
|
||||
int firstRank;
|
||||
uint64_t hostHash; // auto-generated
|
||||
uint64_t pidHash; // auto-generated
|
||||
char description[256];
|
||||
|
||||
int rankToCudaDev(int rank) { return rank - firstRank; }
|
||||
|
||||
int getnGpus() { return gpus.getnDevs(); }
|
||||
|
||||
int getnNetDevs() { return netdevs.getnDevs(); }
|
||||
|
||||
ncclResult_t getGpuPciPath(char* busId, char** path) {
|
||||
return gpus.getPciPath(busId, path);
|
||||
}
|
||||
|
||||
ncclResult_t getNetPciPath(int dev, char** path) {
|
||||
|
||||
return netdevs.getPciPath(dev, path);
|
||||
}
|
||||
|
||||
uint64_t getGpuBusId(int dev) {
|
||||
return gpus.getBusId(dev);
|
||||
}
|
||||
|
||||
int p2pCanConnect(int device1, int device2) { return gpus.p2pCanConnect(device1, device2); }
|
||||
|
||||
hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
|
||||
return gpus.getLinkTypeAndHopCount(device1, device2, linktype, hopcount);
|
||||
}
|
||||
|
||||
uint64_t getIbGuid(char* path) {
|
||||
return netdevs.getIbGuid(path);
|
||||
}
|
||||
|
||||
int shmCanConnect(int device1, int device2) { return 1; }
|
||||
int netCanConnect(int device1, int device2) { return 1; }
|
||||
|
||||
virtual int getNumaId(char *path) {
|
||||
int numa = gpus.getNumaId(path);
|
||||
if (numa != -1) return numa;
|
||||
numa = netdevs.getNumaId(path);
|
||||
if (numa != -1) return numa;
|
||||
WARN("Invalid path %s for getNumaId", path);
|
||||
return 0;
|
||||
}
|
||||
|
||||
virtual ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
|
||||
return cpus.getCpuWidths(name, interCpu, cpuPci, p2pPci);
|
||||
}
|
||||
|
||||
NodeModel(CpuDevices cpu, GpuDevices gpu, NetDevices net, const char *desc) :
|
||||
cpus(cpu), gpus(gpu), netdevs(net) {
|
||||
strncpy(description, desc, 256);
|
||||
}
|
||||
|
||||
NodeModel() {}
|
||||
|
||||
~NodeModel() {}
|
||||
};
|
||||
|
||||
class NetworkModel {
|
||||
private:
|
||||
int nNodes;
|
||||
int nRanks;
|
||||
NodeModel nodes[NCCL_TOPO_MAX_NODES];
|
||||
|
||||
public:
|
||||
void AddNode(NodeModel node) {
|
||||
nodes[nNodes] = node;
|
||||
nodes[nNodes].nodeId = nNodes;
|
||||
nodes[nNodes].firstRank = nRanks;
|
||||
nodes[nNodes].hostHash = ((uint64_t)rand() << 32) | rand();
|
||||
nodes[nNodes].pidHash = ((uint64_t)rand() << 32) | rand();
|
||||
nNodes++;
|
||||
nRanks += node.getnGpus();
|
||||
}
|
||||
|
||||
int GetNNodes() { return nNodes; }
|
||||
|
||||
int GetNRanks() { return nRanks; }
|
||||
|
||||
NodeModel* GetNode(int rank) {
|
||||
int node_id;
|
||||
|
||||
if(rank < 0 || rank >= nRanks)
|
||||
return 0;
|
||||
|
||||
for(node_id = nNodes-1; node_id >= 0; node_id--)
|
||||
if(rank >= nodes[node_id].firstRank) break;
|
||||
|
||||
if (node_id >= 0) {
|
||||
nodes[node_id].currRank = rank;
|
||||
return nodes+node_id;
|
||||
}
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
NetworkModel() : nNodes(0), nRanks(0) {}
|
||||
};
|
||||
|
||||
|
||||
// ---- Sample tables for an 8-GPU node with one or two NICs ----

// GPU bus ids; entry i matches the last component of gpuPciPaths_8[i]
// (for example 0x1d000 <-> 0000:1d:00.0).
static const uint64_t busIds_8[] = {
  0x1d000, 0x20000, 0x23000, 0x26000,
  0x3f000, 0x43000, 0x46000, 0x49000
};

// PCI paths of the 8 GPUs: four under root pci0000:17, four under pci0000:3a.
static const char* gpuPciPaths_8[] = {
  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:08.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0",
  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:0c.0/0000:1e:00.0/0000:1f:00.0/0000:20:00.0",
  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:10.0/0000:21:00.0/0000:22:00.0/0000:23:00.0",
  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:14.0/0000:24:00.0/0000:25:00.0/0000:26:00.0",
  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:04.0/0000:3d:00.0/0000:3e:00.0/0000:3f:00.0",
  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:0c.0/0000:41:00.0/0000:42:00.0/0000:43:00.0",
  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:10.0/0000:44:00.0/0000:45:00.0/0000:46:00.0",
  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:14.0/0000:47:00.0/0000:48:00.0/0000:49:00.0",
};

// NUMA id of each GPU above (all on node 0).
static const int gpuPciNumaIds_8[] = { 0, 0, 0, 0, 0, 0, 0, 0 };

// Single-NIC variant: NIC under root pci0000:17.
static const char* netPciPaths_1[] = {
  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
};

// Alternative single-NIC variant: NIC under root pci0000:3a.
static const char* netPciPaths_1_1[] = {
  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
};

// GUID and NUMA id for the single-NIC variants.
static const uint64_t netGuids_1[] = {
  0xb8599f030007053aULL,
};

static const int netPciNumaIds_1[] = { 0 };

// Two-NIC variant: one NIC under each PCI root, with distinct GUIDs.
static const char* netPciPaths_2[] = {
  "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
  "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
};

static const uint64_t netGuids_2[] = {
  0xb8599f030007053aULL,
  0x506b4b030027bbf2ULL,
};

static const int netPciNumaIds_2[] = { 0, 0 };
|
||||
|
||||
// ---- Sample tables for the "rome" 8-GPU node with one or two NICs ----

// GPU bus ids; entry i matches the last component of rome_gpuPciPaths_8[i]
// (for example 0x63000 <-> 0000:63:00.0).
static const uint64_t rome_busIds_8[] = {
  0x63000, 0x23000, 0x26000, 0x03000,
  0xe3000, 0xc3000, 0xc6000, 0xa3000
};

// PCI paths of the 8 GPUs.
static const char* rome_gpuPciPaths_8[] = {
  "/sys/devices/pci0000:60/0000:60:03.1/0000:61:00.0/0000:62:00.0/0000:63:00.0",
  "/sys/devices/pci0000:20/0000:20:01.1/0000:21:00.0/0000:22:00.0/0000:23:00.0",
  "/sys/devices/pci0000:20/0000:20:03.1/0000:24:00.0/0000:25:00.0/0000:26:00.0",
  "/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0",
  "/sys/devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/0000:e2:00.0/0000:e3:00.0",
  "/sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0/0000:c2:00.0/0000:c3:00.0",
  "/sys/devices/pci0000:c0/0000:c0:03.1/0000:c4:00.0/0000:c5:00.0/0000:c6:00.0",
  "/sys/devices/pci0000:a0/0000:a0:03.1/0000:a1:00.0/0000:a2:00.0/0000:a3:00.0",
};

// NUMA id of each GPU above: the first four on node 0, the last four on node 4.
static const int rome_gpuPciNumaIds_8[] = { 0, 0, 0, 0, 4, 4, 4, 4 };

// Single-NIC variant.
static const char* rome_netPciPaths_1[] = {
  "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
};

static const uint64_t rome_netGuids_1[] = {
  0xb8599f030007053aULL,
};

// NOTE: spelled "rom_" (not "rome_") in the original; the name is kept as is
// because other files may refer to it.
static const int rom_netPciNumaIds_1[] = { 0 };

// Two-NIC variant: NICs on NUMA nodes 0 and 4, with distinct GUIDs.
static const char* rome_netPciPaths_2[] = {
  "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
  "/sys/devices/pci0000:80/0000:80:01.1/0000:81:00.0",
};

static const uint64_t rome_netGuids_2[] = {
  0xb8599f030007053aULL,
  0x506b4b030027bbf2ULL,
};

// NOTE: "rom_" spelling kept, see above.
static const int rom_netPciNumaIds_2[] = { 0, 4 };
|
||||
|
||||
// ---- 8x8 GPU-to-GPU connection matrices, flattened row by row ----
// Entry [i*8 + j] is the value GpuDevices::getLinkTypeAndHopCount decodes for
// the pair (i, j): multiples of 15, 20 and 36 select link types 4, 2 and 1,
// with the quotient as the hop count. The diagonal is always 0.

// Every pair of distinct GPUs at distance 40.
const int conn_mat_pcie[64] = {
   0, 40, 40, 40, 40, 40, 40, 40,
  40,  0, 40, 40, 40, 40, 40, 40,
  40, 40,  0, 40, 40, 40, 40, 40,
  40, 40, 40,  0, 40, 40, 40, 40,
  40, 40, 40, 40,  0, 40, 40, 40,
  40, 40, 40, 40, 40,  0, 40, 40,
  40, 40, 40, 40, 40, 40,  0, 40,
  40, 40, 40, 40, 40, 40, 40,  0,
};

// Two groups of four GPUs: 15/30 inside a group, 40 across groups.
const int conn_mat_4p2h[64] = {
   0, 15, 15, 30, 40, 40, 40, 40,
  15,  0, 30, 15, 40, 40, 40, 40,
  15, 30,  0, 15, 40, 40, 40, 40,
  30, 15, 15,  0, 40, 40, 40, 40,
  40, 40, 40, 40,  0, 15, 15, 30,
  40, 40, 40, 40, 15,  0, 30, 15,
  40, 40, 40, 40, 15, 30,  0, 15,
  40, 40, 40, 40, 30, 15, 15,  0,
};

// Each GPU has six peers at distance 15 and one at distance 30.
const int conn_mat_8p6l[64] = {
   0, 15, 15, 15, 15, 30, 15, 15,
  15,  0, 15, 15, 30, 15, 15, 15,
  15, 15,  0, 15, 15, 15, 15, 30,
  15, 15, 15,  0, 15, 15, 30, 15,
  15, 30, 15, 15,  0, 15, 15, 15,
  30, 15, 15, 15, 15,  0, 15, 15,
  15, 15, 15, 30, 15, 15,  0, 15,
  15, 15, 30, 15, 15, 15, 15,  0,
};

// Same shape as conn_mat_8p6l with a different choice of distance-30 peers.
const int conn_mat_8p6l_1[64] = {
   0, 15, 15, 30, 15, 15, 15, 15,
  15,  0, 30, 15, 15, 15, 15, 15,
  15, 30,  0, 15, 15, 15, 15, 15,
  30, 15, 15,  0, 15, 15, 15, 15,
  15, 15, 15, 15,  0, 15, 15, 30,
  15, 15, 15, 15, 15,  0, 30, 15,
  15, 15, 15, 15, 15, 30,  0, 15,
  15, 15, 15, 15, 30, 15, 15,  0,
};

// Two groups of four GPUs: 40 inside a group, 72 across groups.
const int conn_mat_rome[64] = {
   0, 40, 40, 40, 72, 72, 72, 72,
  40,  0, 40, 40, 72, 72, 72, 72,
  40, 40,  0, 40, 72, 72, 72, 72,
  40, 40, 40,  0, 72, 72, 72, 72,
  72, 72, 72, 72,  0, 40, 40, 40,
  72, 72, 72, 72, 40,  0, 40, 40,
  72, 72, 72, 72, 40, 40,  0, 40,
  72, 72, 72, 72, 40, 40, 40,  0,
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,267 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_H_
|
||||
#define NCCL_H_
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
#define NCCL_MAJOR 2
|
||||
#define NCCL_MINOR 5
|
||||
#define NCCL_PATCH 6
|
||||
#define NCCL_SUFFIX ""
|
||||
|
||||
#define NCCL_VERSION_CODE 2506
|
||||
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
|
||||
|
||||
#define RCCL_BFLOAT16 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Opaque handle to communicator */
|
||||
typedef struct ncclComm* ncclComm_t;
|
||||
|
||||
#define NCCL_UNIQUE_ID_BYTES 128
|
||||
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
|
||||
|
||||
/* Result codes returned by the NCCL entry points declared in this header.
 * ncclNumResults is a count sentinel (one past the last real code). */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclNumResults         = 6
} ncclResult_t;
|
||||
|
||||
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
* This integer is coded with the MAJOR, MINOR and PATCH level of the
|
||||
* NCCL library
|
||||
*/
|
||||
ncclResult_t ncclGetVersion(int *version);
|
||||
ncclResult_t pncclGetVersion(int *version);
|
||||
|
||||
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
|
||||
* called once and the Id should be distributed to all ranks in the
|
||||
* communicator before calling ncclCommInitRank. */
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
|
||||
/* Creates a new communicator (multi thread/process version).
|
||||
* rank must be between 0 and nranks-1 and unique within a communicator clique.
|
||||
* Each rank is associated to a CUDA device, which has to be set before calling
|
||||
* ncclCommInitRank.
|
||||
* ncclCommInitRank implicitly synchronizes with other ranks, so it must be
|
||||
* called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
||||
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
|
||||
|
||||
/* Creates a clique of communicators (single process version).
|
||||
* This is a convenience function to create a single-process communicator clique.
|
||||
* Returns an array of ndev newly initialized communicators in comm.
|
||||
* comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
|
||||
* If devlist is NULL, the first ndev CUDA devices are used.
|
||||
* Order of devlist defines user-order of processors within the communicator. */
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
|
||||
/* Frees resources associated with communicator object, but waits for any operations
|
||||
* that might still be running on the device. */
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
||||
ncclResult_t pncclCommDestroy(ncclComm_t comm);
|
||||
|
||||
/* Frees resources associated with communicator object and aborts any operations
|
||||
* that might still be running on the device. */
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm);
|
||||
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
|
||||
/* Returns a human-readable error message. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
|
||||
/* Checks whether the comm has encountered any asynchronous errors */
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
|
||||
/* Gets the number of ranks in the communicator clique. */
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
|
||||
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
|
||||
|
||||
/* Returns the cuda device number associated with the communicator. */
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
|
||||
/* Returns the user-ordered "rank" associated with the communicator. */
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
|
||||
/* Selects the reduction applied by the reducing collectives.
 * ncclNumOps is a count sentinel (one past the last real operation). */
typedef enum {
  ncclSum    = 0,
  ncclProd   = 1,
  ncclMax    = 2,
  ncclMin    = 3,
  ncclNumOps = 4
} ncclRedOp_t;
|
||||
|
||||
/* Element types accepted by the collectives. Several enumerators are aliases
 * that share a value (e.g. ncclInt8/ncclChar). ncclNumTypes is a count
 * sentinel (one past the last real type). */
typedef enum {
  ncclInt8     = 0,
  ncclChar     = 0,  /* alias of ncclInt8 */
  ncclUint8    = 1,
  ncclInt32    = 2,
  ncclInt      = 2,  /* alias of ncclInt32 */
  ncclUint32   = 3,
  ncclInt64    = 4,
  ncclUint64   = 5,
  ncclFloat16  = 6,
  ncclHalf     = 6,  /* alias of ncclFloat16 */
  ncclFloat32  = 7,
  ncclFloat    = 7,  /* alias of ncclFloat32 */
  ncclFloat64  = 8,
  ncclDouble   = 8,  /* alias of ncclFloat64 */
  ncclBfloat16 = 9,
  ncclNumTypes = 10
} ncclDataType_t;
|
||||
|
||||
/*
|
||||
* Collective communication operations
|
||||
*
|
||||
* Collective communication operations must be called separately for each
|
||||
* communicator in a communicator clique.
|
||||
*
|
||||
* They return when operations have been enqueued on the CUDA stream.
|
||||
*
|
||||
* Since they may perform inter-CPU synchronization, each call has to be done
|
||||
* from a different thread or process, or need to use Group Semantics (see
|
||||
* below).
|
||||
*/
|
||||
|
||||
/*
|
||||
* Reduce
|
||||
*
|
||||
* Reduces data arrays of length count in sendbuff into recvbuff using op
|
||||
* operation.
|
||||
* recvbuff may be NULL on all calls except for root device.
|
||||
* root is the rank (not the CUDA device) where data will reside after the
|
||||
* operation is complete.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* (deprecated) Broadcast (in-place)
|
||||
*
|
||||
* Copies count values from root to all other devices.
|
||||
* root is the rank (not the CUDA device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
* This operation is implicitly in place.
|
||||
*/
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* Broadcast
|
||||
*
|
||||
* Copies count values from root to all other devices.
|
||||
* root is the rank (not the CUDA device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* All-Reduce
|
||||
*
|
||||
* Reduces data arrays of length count in sendbuff using op operation, and
|
||||
* leaves identical copies of result on each recvbuff.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* Reduce-Scatter
|
||||
*
|
||||
* Reduces data in sendbuff using op operation and leaves reduced result
|
||||
* scattered over the devices so that recvbuff on rank i will contain the i-th
|
||||
* block of the result.
|
||||
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
||||
* should have a size of at least nranks*recvcount elements.
|
||||
*
|
||||
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
|
||||
*/
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
|
||||
/*
|
||||
* All-Gather
|
||||
*
|
||||
* Each device gathers sendcount values from other GPUs into recvbuff,
|
||||
* receiving data from rank i at offset i*sendcount.
|
||||
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
||||
* should have a size of at least nranks*sendcount elements.
|
||||
*
|
||||
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
|
||||
*/
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/*
|
||||
* Group semantics
|
||||
*
|
||||
* When managing multiple GPUs from a single thread, and since NCCL collective
|
||||
* calls may perform inter-CPU synchronization, we need to "group" calls for
|
||||
* different ranks/devices into a single call.
|
||||
*
|
||||
* Grouping NCCL calls as being part of the same collective operation is done
|
||||
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
|
||||
* collective calls until the ncclGroupEnd call, which will wait for all calls
|
||||
* to be complete. Note that for collective communication, ncclGroupEnd only
|
||||
* guarantees that the operations are enqueued on the streams, not that
|
||||
* the operation is effectively done.
|
||||
*
|
||||
* Both collective communication and ncclCommInitRank can be used in conjunction
|
||||
* with ncclGroupStart/ncclGroupEnd.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Group Start
|
||||
*
|
||||
* Start a group call. All subsequent calls to NCCL may not block due to
|
||||
* inter-CPU synchronization.
|
||||
*/
|
||||
ncclResult_t ncclGroupStart();
|
||||
ncclResult_t pncclGroupStart();
|
||||
|
||||
/*
|
||||
* Group End
|
||||
*
|
||||
* End a group call. Wait for all calls since ncclGroupStart to complete
|
||||
* before returning.
|
||||
*/
|
||||
ncclResult_t ncclGroupEnd();
|
||||
ncclResult_t pncclGroupEnd();
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
#endif
|
||||
|
||||
#endif // end include guard
|
||||
@@ -0,0 +1,44 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef UTILS_H_
|
||||
#define UTILS_H_
|
||||
|
||||
struct allGather1Data_t {
|
||||
struct ncclPeerInfo peerInfo;
|
||||
struct ncclComm* comm;
|
||||
};
|
||||
|
||||
struct allGather3Data_t {
|
||||
int cudaCompCap;
|
||||
int fullCudaCompCap;
|
||||
int nvlink;
|
||||
int nChannels;
|
||||
struct {
|
||||
int sameChannels;
|
||||
int speedIntra;
|
||||
int speedInter;
|
||||
int nvlink;
|
||||
} tree;
|
||||
struct {
|
||||
int sameChannels;
|
||||
int speedIntra;
|
||||
int speedInter;
|
||||
int nvlink;
|
||||
} ring;
|
||||
struct ncclTopoRanks topoRanks;
|
||||
};
|
||||
|
||||
ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data);
|
||||
|
||||
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
|
||||
struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
|
||||
|
||||
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
|
||||
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,216 @@
|
||||
/*
|
||||
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "channel.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "bootstrap.h"
|
||||
#include "transport.h"
|
||||
#include "group.h"
|
||||
#include "net.h"
|
||||
#include "graph.h"
|
||||
#include "argcheck.h"
|
||||
#include <sched.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
#include <dlfcn.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include "model.h"
|
||||
|
||||
extern NodeModel *node_model;
|
||||
|
||||
static ncclResult_t dummyNetDevices(int* ndev) {
|
||||
*ndev = node_model->getnNetDevs();
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t dummyNetPciPath(int dev, char** path) {
|
||||
node_model->getNetPciPath(dev, path);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclNet_t ncclNetDummy = {
|
||||
"IB",
|
||||
0,
|
||||
dummyNetDevices,
|
||||
dummyNetPciPath,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
};
|
||||
|
||||
ncclNet_t* ncclNet = &ncclNetDummy;
|
||||
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
|
||||
int busIdToCudaDev(int64_t busId) {
|
||||
int cudaDev;
|
||||
|
||||
for (cudaDev = 0; cudaDev < node_model->getnGpus(); cudaDev++) {
|
||||
if (node_model->getGpuBusId(cudaDev) == busId)
|
||||
break;
|
||||
}
|
||||
|
||||
if (cudaDev < node_model->getnGpus())
|
||||
return cudaDev;
|
||||
else
|
||||
WARN("Invalid busId %lx", busId);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Determine if two peers can communicate with P2P */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
// Rule out different nodes
|
||||
*ret = 0;
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
int cudaDev2 = busIdToCudaDev(info2->busId);
|
||||
*ret = node_model->p2pCanConnect(cudaDev1, cudaDev2);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Send: Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (myInfo->cudaDev == peerInfo->cudaDev) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
|
||||
return ncclInternalError;
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
}
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
//TRACE_DUMP_IPC(&info.devIpc);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclTransport p2pTransport = {
|
||||
"P2P",
|
||||
p2pCanConnect,
|
||||
{ p2pSendSetup, NULL, NULL, NULL },
|
||||
{ p2pRecvSetup, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
/* Determine if two peers can communicate with SHM */
|
||||
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
// Rule out different nodes
|
||||
*ret = 0;
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
int cudaDev2 = busIdToCudaDev(info2->busId);
|
||||
*ret = node_model->shmCanConnect(cudaDev1, cudaDev2);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclTransport shmTransport = {
|
||||
"SHM",
|
||||
shmCanConnect,
|
||||
{ shmSendSetup, NULL, NULL, NULL },
|
||||
{ shmRecvSetup, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
/* Determine if two peers can communicate with NET */
|
||||
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = node_model->netCanConnect(info1->rank, info2->rank);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
|
||||
int netDev, useGdr = 0;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &netDev));
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev,
|
||||
useGdr ? "/GDRDMA" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
|
||||
|
||||
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
|
||||
int netDev, useGdr = 0;
|
||||
|
||||
NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &netDev));
|
||||
// Check if we are close enough that it makes sense to enable GDR
|
||||
int netGdrLevel = ncclParamNetGdrLevel();
|
||||
int distance;
|
||||
NCCLCHECK(ncclTopoNetDistance(topo, myInfo->busId, netDev, &distance));
|
||||
if (distance >= netGdrLevel) {
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), myInfo->busId, netDev, distance, netGdrLevel);
|
||||
}
|
||||
else
|
||||
useGdr = 1;
|
||||
INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev,
|
||||
useGdr ? "/GDRDMA" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclTransport netTransport = {
|
||||
"NET",
|
||||
netCanConnect,
|
||||
{ netSendSetup, NULL, NULL, NULL },
|
||||
{ netRecvSetup, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
||||
p2pTransport,
|
||||
shmTransport,
|
||||
netTransport,
|
||||
};
|
||||
@@ -0,0 +1,231 @@
|
||||
/*
|
||||
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "channel.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "bootstrap.h"
|
||||
#include "transport.h"
|
||||
#include "group.h"
|
||||
#include "net.h"
|
||||
#include "graph.h"
|
||||
#include "argcheck.h"
|
||||
#include "cpuset.h"
|
||||
#include <sched.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
#include <dlfcn.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
#include "model.h"
|
||||
#include "utils.h"
|
||||
|
||||
NodeModel *node_model;
|
||||
|
||||
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
|
||||
char ** itr = std::find(begin, end, option);
|
||||
if (itr != end && ++itr != end)
|
||||
{
|
||||
return *itr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool cmdOptionExists(char** begin, char** end, const std::string& option) {
|
||||
return std::find(begin, end, option) != end;
|
||||
}
|
||||
|
||||
const char *model_descriptions[] = {
|
||||
"4 nodes with 8 GPUs PCIe 1 NIC",
|
||||
"4 nodes with 8 GPUs PCIe 2 NIC",
|
||||
"2 nodes VEGA20 4P1H",
|
||||
"4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC",
|
||||
"single node gfx908 4P3L",
|
||||
"single node gfx908 8P6L",
|
||||
"single node gfx908 8P6L Alt. Connection",
|
||||
"single node 8 GPUs PCIe on Rome",
|
||||
"4 nodes 8 GPUs PCIe 2 NICs on Rome",
|
||||
"3 nodes 8 GPUs PCIe + 1 Rome 8 GPUs PCIe + 2 nodes gfx908 4P3L",
|
||||
NULL,
|
||||
};
|
||||
|
||||
int main(int argc,char* argv[])
|
||||
{
|
||||
struct ncclComm *comm;
|
||||
|
||||
if (!cmdOptionExists(argv, argv + argc, "-m")) {
|
||||
printf("Usage: ./topo_expl -m model_id\n");
|
||||
printf("List of model_id:\n");
|
||||
for (int i = 0; model_descriptions[i] != NULL; i++)
|
||||
printf(" %d: %s\n", i, model_descriptions[i]);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
int model_id = 0;
|
||||
char *mi = getCmdOption(argv, argv + argc, "-m");
|
||||
if (mi)
|
||||
model_id = atol(mi);
|
||||
|
||||
// CPU, GPU and NIC devices on Skylake
|
||||
CpuDevices skylake("Skylake", 12, 12, 12);
|
||||
GpuDevices vg20_pcie(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_pcie);
|
||||
GpuDevices vg20_4p1h(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
|
||||
GpuDevices vg20_4p2h(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
|
||||
GpuDevices gfx908_4p3l(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
|
||||
GpuDevices gfx908_8p6l(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
|
||||
GpuDevices gfx908_8p6l_1(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l_1);
|
||||
NetDevices nic_1(1, netPciPaths_1, netGuids_1, netPciNumaIds_1);
|
||||
NetDevices nic_1_1(1, netPciPaths_1_1, netGuids_1, netPciNumaIds_1);
|
||||
NetDevices nic_2(2, netPciPaths_2, netGuids_2, netPciNumaIds_2);
|
||||
|
||||
// CPU, GPU and NIC devices on Rome
|
||||
CpuDevices rome("Rome", 24, 24, 24);
|
||||
GpuDevices vg20_pcie_rome(8, rome_busIds_8, rome_gpuPciPaths_8, rome_gpuPciNumaIds_8, conn_mat_rome);
|
||||
NetDevices nic_1_rome(1, rome_netPciPaths_1, rome_netGuids_1, rom_netPciNumaIds_1);
|
||||
NetDevices nic_2_rome(2, rome_netPciPaths_2, rome_netGuids_2, rom_netPciNumaIds_2);
|
||||
|
||||
// 8 GPUs PCIe 1 NIC
|
||||
NodeModel model_8pcie_1nic(skylake, vg20_pcie, nic_1, "Skylake 8 GPUs PCIe");
|
||||
|
||||
// 8 GPUs PCIe 2 NIC
|
||||
NodeModel model_8pcie_2nic(skylake, vg20_pcie, nic_2, "Skylake 8 GPUs PCIe 2 NIC");
|
||||
|
||||
// VEGA20 4P1H, use VEGA20 4P2H model
|
||||
NodeModel model_vg20_4p1h_1nic(skylake, vg20_4p1h, nic_1, "Skylake VEGA20 4P1H");
|
||||
|
||||
// VEGA20 GPUs XGMI 4P2H 1 NIC
|
||||
NodeModel model_vg20_4p2h_1nic(skylake, vg20_4p2h, nic_1_1, "Skylake VEGA20 4P2H");
|
||||
|
||||
// gfx908 4P3L
|
||||
NodeModel model_gfx908_4p_1nic(skylake, gfx908_4p3l, nic_1, "Skylake gfx908 4P3L");
|
||||
|
||||
// gfx908 8P6L
|
||||
NodeModel model_gfx908_8p_1nic(skylake, gfx908_8p6l, nic_1, "Skylake gfx908 8P6L");
|
||||
|
||||
// gfx908 8P6L alternative connection
|
||||
NodeModel model_gfx908_8p_1nic_1(skylake, gfx908_8p6l_1, nic_1, "Skylake gfx908 8P6L Alt. Connection");
|
||||
|
||||
// 8 GPUs PCIe on Rome
|
||||
NodeModel model_8pcie_1nic_rome(rome, vg20_pcie_rome, nic_1_rome, "Rome 8 GPUs PCIe");
|
||||
|
||||
// 8 GPUs PCIe 2 NICs on Rome
|
||||
NodeModel model_8pcie_2nic_rome(rome, vg20_pcie_rome, nic_2_rome, "Rome 8 GPUs PCIe 2 NICs");
|
||||
|
||||
NetworkModel network;
|
||||
|
||||
switch(model_id) {
|
||||
case 0:
|
||||
for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_1nic);
|
||||
break;
|
||||
case 1:
|
||||
for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic);
|
||||
break;
|
||||
case 2:
|
||||
for (int i = 0; i < 2; i ++) network.AddNode(model_vg20_4p1h_1nic);
|
||||
break;
|
||||
case 3:
|
||||
for (int i = 0; i < 4; i ++) network.AddNode(model_vg20_4p2h_1nic);
|
||||
break;
|
||||
case 4:
|
||||
network.AddNode(model_gfx908_4p_1nic);
|
||||
break;
|
||||
case 5:
|
||||
network.AddNode(model_gfx908_8p_1nic);
|
||||
break;
|
||||
case 6:
|
||||
network.AddNode(model_gfx908_8p_1nic_1);
|
||||
break;
|
||||
case 7:
|
||||
network.AddNode(model_8pcie_1nic_rome);
|
||||
break;
|
||||
case 8:
|
||||
for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic_rome);
|
||||
break;
|
||||
case 9:
|
||||
for (int i = 0; i < 3; i ++) network.AddNode(model_8pcie_1nic);
|
||||
network.AddNode(model_8pcie_1nic_rome);
|
||||
for (int i = 0; i < 2; i ++) network.AddNode(model_gfx908_4p_1nic);
|
||||
break;
|
||||
default:
|
||||
printf("Invalid model_id %d\n", model_id);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
printf("Generating topology using %d: %s\n", model_id, model_descriptions[model_id]);
|
||||
|
||||
int nranks = network.GetNRanks();
|
||||
int nnodes = network.GetNNodes();
|
||||
|
||||
printf("nnodes = %d, nranks = %d\n", nnodes, nranks);
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
node_model = network.GetNode(i);
|
||||
assert(node_model!=0);
|
||||
printf("Rank %d: node %d (%s) GPU busId %lx\n", i, node_model->nodeId,
|
||||
node_model->description, node_model->getGpuBusId(node_model->rankToCudaDev(i)));
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclCalloc(&comm, nranks));
|
||||
|
||||
struct allGather1Data_t *allGather1Data;
|
||||
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
|
||||
|
||||
struct allGather3Data_t *allGather3Data;
|
||||
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
|
||||
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
comm[i].rank = i;
|
||||
comm[i].nRanks = nranks;
|
||||
node_model = network.GetNode(i);
|
||||
assert(node_model!=0);
|
||||
bootstrapAllGather(&comm[i], allGather1Data);
|
||||
}
|
||||
|
||||
struct ncclTopoGraph treeGraph, ringGraph;
|
||||
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
node_model = network.GetNode(i);
|
||||
assert(node_model!=0);
|
||||
initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph);
|
||||
}
|
||||
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
node_model = network.GetNode(i);
|
||||
assert(node_model!=0);
|
||||
initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph);
|
||||
}
|
||||
|
||||
free(allGather3Data);
|
||||
free(allGather1Data);
|
||||
|
||||
free(comm);
|
||||
printf("Done generating topology using %d: %s\n", model_id, model_descriptions[model_id]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,420 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "channel.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "bootstrap.h"
|
||||
#include "transport.h"
|
||||
#include "group.h"
|
||||
#include "net.h"
|
||||
#include "graph.h"
|
||||
#include "argcheck.h"
|
||||
#include <sched.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
#include <dlfcn.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include "model.h"
|
||||
#include "utils.h"
|
||||
|
||||
extern NodeModel *node_model;
|
||||
|
||||
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
|
||||
|
||||
// Get current Compute Capability
|
||||
int ncclCudaCompCap() {
|
||||
int ccMajor = 1, ccMinor = 0;
|
||||
return ccMajor*10+ccMinor;
|
||||
}
|
||||
|
||||
ncclResult_t int64ToBusId(int64_t id, char* busId) {
|
||||
sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t busIdToInt64(char* busId, int64_t* id) {
|
||||
const int size = strlen(busId);
|
||||
char* hexStr;
|
||||
NCCLCHECK(ncclCalloc(&hexStr, size));
|
||||
int hexOffset = 0;
|
||||
for (int i=0; i<size; i++) {
|
||||
char c = busId[i];
|
||||
if (c == '.' || c == ':') continue;
|
||||
if ((c >= '0' && c <= '9') ||
|
||||
(c >= 'A' && c <= 'F') ||
|
||||
(c >= 'a' && c <= 'f')) {
|
||||
hexStr[hexOffset++] = busId[i];
|
||||
} else break;
|
||||
}
|
||||
hexStr[hexOffset] = '\0';
|
||||
*id = strtol(hexStr, NULL, 16);
|
||||
free(hexStr);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int ncclDebugLevel = -1;
|
||||
|
||||
void ncclDebugInit() {
|
||||
if (ncclDebugLevel != -1) return;
|
||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||
if (nccl_debug == NULL) {
|
||||
ncclDebugLevel = NCCL_LOG_NONE;
|
||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_VERSION;
|
||||
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_WARN;
|
||||
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_ABORT;
|
||||
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_TRACE;
|
||||
}
|
||||
}
|
||||
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (ncclDebugLevel == -1) ncclDebugInit();
|
||||
if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return;
|
||||
char buffer[1024];
|
||||
size_t len;
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"[%d:%d] ", node_model->nodeId, node_model->currRank);
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
vsprintf(buffer+len, fmt, args);
|
||||
va_end(args);
|
||||
printf("%s\n", buffer);
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n",
|
||||
node_model->nodeId, node_model->currRank, filefunc, line);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) {
|
||||
// AllGather1 - begin
|
||||
allGather1Data[comm->rank].peerInfo.rank = comm->rank;
|
||||
allGather1Data[comm->rank].peerInfo.cudaDev = node_model->rankToCudaDev(comm->rank);
|
||||
allGather1Data[comm->rank].peerInfo.gdrSupport = 1;
|
||||
allGather1Data[comm->rank].peerInfo.hostHash = node_model->hostHash;
|
||||
allGather1Data[comm->rank].peerInfo.pidHash = node_model->pidHash;
|
||||
allGather1Data[comm->rank].peerInfo.shmDev = 0x19;
|
||||
allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(node_model->rankToCudaDev(comm->rank));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
|
||||
struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph) {
|
||||
// We use 3 AllGathers
|
||||
// 1. { peerInfo, comm }
|
||||
// 2. ConnectTransport[nranks], ConnectValue[nranks]
|
||||
// 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
|
||||
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
//uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
|
||||
//TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
|
||||
//NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
|
||||
|
||||
// AllGather1 - begin
|
||||
//struct allGather1Data_t *allGather1Data;
|
||||
//NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
|
||||
//allGather1Data[rank].comm = comm;
|
||||
struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
|
||||
//NCCLCHECK(fillInfo(comm, myInfo, commHash));
|
||||
//NCCLCHECK(bootstrapAllGather(comm, allGather1Data));
|
||||
|
||||
NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
|
||||
if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
|
||||
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
// AllGather1 data is used again below
|
||||
// AllGather1 - end
|
||||
|
||||
// Topo detection / System graph creation
|
||||
NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
|
||||
// Compute paths between GPUs and NICs
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
// Remove inaccessible GPUs and unused NICs
|
||||
NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
|
||||
// Recompute paths after trimming
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
// Compute max speed to accelerate search
|
||||
NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
|
||||
// Print final topology
|
||||
NCCLCHECK(ncclTopoPrint(comm->topo));
|
||||
|
||||
// Get rings and trees
|
||||
//struct ncclTopoGraph treeGraph;
|
||||
treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
|
||||
treeGraph.crossNic = ncclParamCrossNic();
|
||||
// We communicate only half the data between node with trees on 2 nodes.
|
||||
NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
|
||||
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
|
||||
//struct ncclTopoGraph ringGraph;
|
||||
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
|
||||
ringGraph.crossNic = ncclParamCrossNic();
|
||||
NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
|
||||
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
|
||||
|
||||
// AllGather3 - begin
|
||||
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
|
||||
allGather3Data[rank].nvlink = treeGraph.nvlink;
|
||||
allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
|
||||
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
|
||||
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
|
||||
allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
|
||||
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
|
||||
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
|
||||
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
|
||||
allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
|
||||
|
||||
NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
|
||||
//INFO(NCCL_GRAPH, "%d: nvlink %d nChannels %d tree.sameChannels %d tree.speedIntra %d tree.speedInter %d tree.nvlink %d ring.sameChannels %d ring.speedIntra %d ring.speedInter %d ring.nvlink %d",
|
||||
// rank, allGather3Data[rank].nvlink, allGather3Data[rank].nChannels, allGather3Data[rank].tree.sameChannels, allGather3Data[rank].tree.speedIntra, allGather3Data[rank].tree.speedInter, allGather3Data[rank].tree.nvlink,
|
||||
// allGather3Data[rank].ring.sameChannels, allGather3Data[rank].ring.speedIntra, allGather3Data[rank].ring.speedInter, allGather3Data[rank].ring.nvlink);
|
||||
//INFO(NCCL_GRAPH, " ringRecv %d ringSend %d ringPrev %d ringNext %d treeUpRecv %d treeUpSend %d treeDnRecv %d treeDnSend %d",
|
||||
// allGather3Data[rank].topoRanks.ringRecv[0], allGather3Data[rank].topoRanks.ringSend[0], allGather3Data[rank].topoRanks.ringPrev[0], allGather3Data[rank].topoRanks.ringNext[0],
|
||||
// allGather3Data[rank].topoRanks.treeUpRecv[0], allGather3Data[rank].topoRanks.treeUpSend[0], allGather3Data[rank].topoRanks.treeDnRecv[0], allGather3Data[rank].topoRanks.treeDnSend[0]);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
|
||||
if (ret) {
|
||||
//cpu_set_t affinitySave;
|
||||
//sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||
//int cudaDev;
|
||||
//CUDACHECK(hipGetDevice(&cudaDev));
|
||||
//setCpuAffinity(cudaDev);
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
|
||||
//sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("No transport found !");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
|
||||
uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
|
||||
struct ncclConnect connect;
|
||||
struct ncclConnector* conn;
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1) continue;
|
||||
conn = &channel->peers[peer].recv;
|
||||
if (conn->connected) { ++nSkippedRecv; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
||||
//NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1) continue;
|
||||
conn = &channel->peers[peer].send;
|
||||
if (conn->connected) { ++nSkippedSend; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
|
||||
//NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1) continue;
|
||||
conn = &channel->peers[peer].send;
|
||||
if (conn->connected) {++nSkippedSend; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
//NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
//NCCLCHECK(conn->transportComm->connect(&connect, conn));
|
||||
conn->connected = 1;
|
||||
}
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1) continue;
|
||||
conn = &channel->peers[peer].recv;
|
||||
if (conn->connected) {++nSkippedRecv; continue; }
|
||||
memset(&connect, 0, sizeof(connect));
|
||||
//CCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
|
||||
//NCCLCHECK(conn->transportComm->connect(&connect, conn));
|
||||
conn->connected = 1;
|
||||
}
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Prepare the host-side state of channel `channelid` of `comm`.
//
// Records the channel id, allocates the ring's user-rank table and the
// per-peer connector array (both with comm->nRanks entries, via ncclCalloc),
// and points every peer's send and recv connector back at `comm`.
//
// The device-side allocations, the buffSize assignment and the per-channel
// collectives list are commented out in this build, so only host memory is
// touched here.
//
// Returns ncclSuccess, or whatever error NCCLCHECK propagates from ncclCalloc.
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* channel = &comm->channels[channelid];
  channel->id = channelid;

  // Intermediate buffering is not configured here:
  //channel->buffSize = ncclParamBuffsize();

  // Host copy of the ring-index -> user-rank table (device copy disabled).
  //NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));

  // Host copy of the per-peer communication structures (device copy disabled).
  //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));

  // Each connector needs a back-pointer to the communicator that owns it.
  const size_t peerCount = comm->nRanks;
  for (size_t p = 0; p < peerCount; ++p) {
    channel->peers[p].send.comm = comm;
    channel->peers[p].recv.comm = comm;
  }

  // Per-channel operation list is not allocated here:
  //NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
  return ncclSuccess;
}
|
||||
|
||||
// Initialise channel `channelId` and fill its ring's userRanks table with
// `ringRanks` rotated so that `rank` comes first.
//
//   comm      - communicator owning the channel
//   channelId - index into comm->channels
//   rank      - rank that should end up at position 0 of the table
//   nranks    - number of entries in `ringRanks` (and written to userRanks)
//   ringRanks - ring order to rotate
//
// If `rank` does not occur in `ringRanks`, the table is copied unrotated.
// Returns ncclSuccess, or the error propagated from initChannel().
static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
  NCCLCHECK(initChannel(comm, channelId));

  struct ncclRing* ring = &comm->channels[channelId].ring;

  // Locate `rank` in the ring; `start` ends up equal to nranks when absent.
  int start = 0;
  while (start < nranks && ringRanks[start] != rank) start++;

  // Copy nranks entries beginning at `start`, wrapping around the end once.
  int src = start;
  for (int dst = 0; dst < nranks; dst++, src++) {
    if (src >= nranks) src -= nranks;
    ring->userRanks[dst] = ringRanks[src];
  }
  return ncclSuccess;
}
|
||||
|
||||
// Worker for initTransportsRank_3(): performs the whole post-AllGather3 setup
// using scratch buffers owned by the caller, so it may return early through
// NCCLCHECK without leaking anything.
//
//   nodesFirstRank - zeroed scratch array with comm->nRanks entries
//   allTopoRanks   - scratch array with comm->nRanks entries
//   rings          - zeroed scratch array with comm->nRanks*MAXCHANNELS entries
static ncclResult_t initTransportsRank_3_impl(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
    struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph,
    int* nodesFirstRank, struct ncclTopoRanks** allTopoRanks, int* rings) {
  int rank = comm->rank;
  int nranks = comm->nRanks;
  // The bootstrap exchange is disabled; allGather3Data is read as passed in.
  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));

  // Determine nNodes, firstRanks, ...
  // Ranks reporting the same ringRecv[0] are grouped into the same node.
  // NOTE(review): comm->nNodes is only ever incremented here, so this relies
  // on it being 0 on entry -- confirm against the caller.
  for (int i=0; i<nranks; i++) {
    int node = -1;
    int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
    for (int n=0; n<comm->nNodes; n++) {
      if (nodesFirstRank[n] == firstRank) node = n;
    }
    if (node == -1) {
      node = comm->nNodes++;
      nodesFirstRank[node] = firstRank;
    }
    if (i == comm->rank) comm->node = node;
  }

  // comm->nvlink stays set only if every rank reports nvlink.
  comm->nvlink = 1;
  for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;

  int nChannelsOrig = comm->nChannels;
  for (int i=0; i<nranks; i++) {
    allTopoRanks[i] = &allGather3Data[i].topoRanks;
    // Make sure we align all ranks so that the tuning is consistent across ranks
    treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
    treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
    treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
    treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
    ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
    ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
    ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
  }

  if (comm->nChannels < nChannelsOrig) {
    // We started duplicating channels during Preset(), so we need to move the
    // duplicated channels since we have removed some.
    for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
  }

  // Log the first rank of every node. Each append is bounded by the space
  // left in `line`, so a large node count truncates the message instead of
  // overrunning the buffer.
  char line[1024];
  snprintf(line, sizeof(line), "nodesFirstRank: ");
  for (int i=0; i<comm->nNodes; i++) {
    size_t used = strlen(line);
    snprintf(line+used, sizeof(line)-used, "%d ", nodesFirstRank[i]);
  }
  INFO(NCCL_INIT, "%s", line);

  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));

  // AllGather3 - end

  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);

  // Log the up/down tree neighbours of this rank for every channel.
  line[0]='\0';
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclTree* treeUp = &comm->channels[c].treeUp;
    struct ncclTree* treeDn = &comm->channels[c].treeDn;
    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
        c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
        treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
  }
  line[1023] = '\0';
  INFO(NCCL_INIT, "Trees%s", line);

  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

  // Connect with prev/next for each ring, and with the tree neighbours.
  // `rings` must still be alive here: setupChannel() reads this channel's
  // slice of it to build the ring's userRanks table.
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
    if (comm->nRanks == 1) continue;
    NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
  }
  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);

  return ncclSuccess;
}

// Third stage of transport initialisation for one rank: consumes the
// per-rank AllGather3 records to
//   - group ranks into nodes (comm->nNodes, comm->node),
//   - compute comm->nvlink,
//   - align channel count / speeds / nvlink of both graphs across all ranks,
//   - run ncclTopoPostset() to obtain the final rings and trees,
//   - set up every channel and its ring/tree peer connections.
//
//   comm           - communicator being initialised (modified)
//   allGather3Data - array of comm->nRanks records, one per rank; not freed here
//   treeGraph      - tree graph, tightened in place to the minimum over all ranks
//   ringGraph      - ring graph, tightened in place to the minimum over all ranks
//
// Returns ncclSuccess or the first error reported by an allocation or by a
// setup step. The temporary buffers are owned by this function and released
// on every path, including errors, and only after the last reader of `rings`
// has run.
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph,
    struct ncclTopoGraph& ringGraph) {
  int nranks = comm->nRanks;

  // Start from NULL so the unconditional free() calls below are safe no
  // matter which allocation (if any) fails.
  int* nodesFirstRank = NULL;
  struct ncclTopoRanks** allTopoRanks = NULL;
  int* rings = NULL;

  ncclResult_t status = ncclCalloc(&nodesFirstRank, nranks);
  if (status == ncclSuccess) status = ncclCalloc(&allTopoRanks, nranks);
  if (status == ncclSuccess) status = ncclCalloc(&rings, nranks*MAXCHANNELS);
  if (status == ncclSuccess) {
    status = initTransportsRank_3_impl(comm, allGather3Data, treeGraph, ringGraph,
                                       nodesFirstRank, allTopoRanks, rings);
  }

  free(rings);
  free(allTopoRanks);
  free(nodesFirstRank);

  // Report/propagate a failure the same way the rest of the file does.
  NCCLCHECK(status);
  return ncclSuccess;
}
|
||||
Ссылка в новой задаче
Block a user