Merge pull request #172 from wenkaidu/topo_expl

Add topology explorer

[ROCm/rccl commit: 5b3856f2ed]
Этот коммит содержится в:
Wenkai Du
2020-02-20 15:16:55 -08:00
коммит произвёл GitHub
родитель 9dad3e0a90 00f421ccbd
Коммит cf4bce4ad3
8 изменённых файлов: 1650 добавлений и 0 удалений
+34
Просмотреть файл
@@ -30,9 +30,15 @@ const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "QPI", "NET" };
const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
#endif
#ifdef TOPO_EXPL
#include "model.h"
extern NodeModel *node_model;
#endif
/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/
#ifndef TOPO_EXPL
static int getNumaId(char *path) {
char npath[PATH_MAX];
snprintf(npath, PATH_MAX, "%s/numa_node", path);
@@ -59,6 +65,15 @@ static ncclResult_t getPciPath(char* busId, char** path) {
}
return ncclSuccess;
}
#else
// TOPO_EXPL build: the NUMA node of a PCI device is looked up in the
// simulated node model (node_model) instead of being read from the system.
static int getNumaId(char *path) {
  const int numaNode = node_model->getNumaId(path);
  return numaNode;
}
// TOPO_EXPL build: resolve a GPU bus id string to its PCI path through the
// simulated node model. The result code of the model is passed through as-is.
static ncclResult_t getPciPath(char* busId, char** path) {
  const ncclResult_t status = node_model->getGpuPciPath(busId, path);
  return status;
}
#endif
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
@@ -102,6 +117,7 @@ int interCpuWidth = 0;
int cpuPciWidth = 0;
int p2pPciWidth = 0;
#ifndef TOPO_EXPL
static ncclResult_t getCpuWidths() {
// Check if already detected
if (interCpuWidth + cpuPciWidth + p2pPciWidth) return ncclSuccess;
@@ -182,6 +198,14 @@ static ncclResult_t getCpuWidths() {
INFO(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth);
return ncclSuccess;
}
#else
// TOPO_EXPL build: fill the file-level interCpuWidth / cpuPciWidth /
// p2pPciWidth globals from the simulated node model.
// Returns ncclSuccess, or the model's error code if the lookup fails.
static ncclResult_t getCpuWidths() {
  // Start with an empty string so the buffer is always a valid C string,
  // whatever the model writes into it.
  char cpu[256] = "";
  // Propagate a model failure instead of tracing half-filled values.
  NCCLCHECK(node_model->getCpuWidths(cpu, &interCpuWidth, &cpuPciWidth, &p2pPciWidth));
  TRACE(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth);
  return ncclSuccess;
}
#endif
static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
NCCLCHECK(getCpuWidths());
@@ -272,7 +296,11 @@ ncclResult_t ncclTopoConnectXGMI(struct ncclComm* comm, struct ncclTopoSystem* s
uint32_t link_type, hops;
int cudaDev1 = busIdToCudaDev(comm->peerInfo[gpu1->rank].busId);
int cudaDev2 = busIdToCudaDev(comm->peerInfo[gpu2->rank].busId);
#ifndef TOPO_EXPL
if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) {
#else
if (node_model->getLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) {
#endif
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) {
NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu2, LINK_NVL, minWidth));
}
@@ -424,6 +452,7 @@ ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTop
// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
#include <glob.h>
#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
#ifndef TOPO_EXPL
uint64_t getIbGuid(char* path) {
uint64_t guid = 0ULL;
char guidPath[PATH_MAX];
@@ -446,6 +475,11 @@ uint64_t getIbGuid(char* path) {
}
return guid;
}
#else
// TOPO_EXPL build: the image GUID of the IB device at `path` comes from the
// simulated node model.
uint64_t getIbGuid(char* path) {
  const uint64_t imageGuid = node_model->getIbGuid(path);
  return imageGuid;
}
#endif
struct netInfo {
char* path;
+19
Просмотреть файл
@@ -0,0 +1,19 @@
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Builds the stand-alone topology explorer from its own sources plus the
# graph sources of the library.

# Use /opt/rocm/hip when it exists (and HIP_PATH was not given by the caller);
# otherwise fall back to a path relative to this directory.
HIP_PATH ?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH = ../../..
endif
HIPCC = $(HIP_PATH)/bin/hipcc

EXE = topo_expl
# -DTOPO_EXPL selects the "#ifdef TOPO_EXPL" code paths in the graph sources.
CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE
files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/graph/search.cc ../../src/graph/connect.cc

# "all" and "clean" do not name files; declaring them phony keeps make from
# skipping them when a file with one of those names happens to exist.
.PHONY: all clean

all: $(EXE)

$(EXE): $(files)
	$(HIPCC) $(CXXFLAGS) $^ -o $@

clean:
	rm -f *.o $(EXE)
+419
Просмотреть файл
@@ -0,0 +1,419 @@
/*
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef MODEL_H_
#define MODEL_H_
// Description of the CPU of a modelled node: a name plus the three link
// widths that getCpuWidths() hands back to the topology code.
class CpuDevices {
private:
  const char *cpuName;   // borrowed pointer, never freed here; NULL when default-constructed
  int interCpuWidth;
  int cpuPciWidth;
  int p2pPciWidth;
public:
  CpuDevices(const char *cpuname, const int intercpuwidth, const int cpupciwidth, const int p2ppciwidth) :
    cpuName(cpuname), interCpuWidth(intercpuwidth), cpuPciWidth(cpupciwidth), p2pPciWidth(p2ppciwidth) {}
  CpuDevices() : cpuName(0), interCpuWidth(0), cpuPciWidth(0), p2pPciWidth(0) {}

  // Copies the CPU name into `name` and the widths into the three out-params.
  // `name` must be large enough for the stored name plus the terminator
  // (the size is not passed in, so it cannot be checked here).
  // Always returns ncclSuccess.
  ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
    // A default-constructed object has no name; strcpy() from NULL would be
    // undefined behaviour, so report an empty name instead.
    strcpy(name, cpuName ? cpuName : "");
    *interCpu = interCpuWidth;
    *cpuPci = cpuPciWidth;
    *p2pPci = p2pPciWidth;
    return ncclSuccess;
  }
};
// Description of the GPUs of a modelled node: bus ids, PCI paths, NUMA ids
// and a flat connection matrix. All tables are borrowed (never freed here).
class GpuDevices {
private:
  // The connection matrix is always addressed with a row stride of 8,
  // independent of nGpus.
  enum { kConnMatrixDim = 8 };

  int nGpus;
  const uint64_t *busIds;
  const char **gpuPciPaths;
  const int *gpuNumaIds;
  const int *connMatrix;

  // True when both device indices address a cell inside the matrix.
  bool validPair(int device1, int device2) const {
    return connMatrix != 0 &&
           device1 >= 0 && device1 < kConnMatrixDim &&
           device2 >= 0 && device2 < kConnMatrixDim;
  }

public:
  GpuDevices(const int ngpus, const uint64_t *busids, const char **gpupcipaths, const int *gpunumaids, const int *connmatrix) :
    nGpus(ngpus), busIds(busids), gpuPciPaths(gpupcipaths), gpuNumaIds(gpunumaids), connMatrix(connmatrix) {}
  GpuDevices () : nGpus(0), busIds(0), gpuPciPaths(0), gpuNumaIds(0), connMatrix(0) {}

  int getnDevs() { return nGpus; }

  // No range check: `dev` must be in [0, nGpus).
  uint64_t getBusId(int dev) { return busIds[dev]; }

  // Looks up the GPU whose bus id formats to `busId` and returns a freshly
  // malloc'ed copy of its PCI path in *path (the caller frees it).
  // On any failure *path is NULL and nothing is leaked.
  // Returns ncclSystemError when the bus id is unknown, the path does not fit
  // in PATH_MAX, or the allocation fails.
  ncclResult_t getPciPath(char* busId, char** path) {
    char tempBusId[] = "0000:00:00.0";
    *path = NULL;

    // Search first and allocate afterwards, so neither the not-found path nor
    // an early NCCLCHECK return can leak a buffer.
    int found = -1;
    for (int i = 0; i < nGpus; i++) {
      NCCLCHECK(int64ToBusId(busIds[i], tempBusId));
      if (strcmp(busId, tempBusId) == 0) {
        found = i;
        break;
      }
    }
    if (found < 0) {
      WARN("Could not find real path of %s", busId);
      return ncclSystemError;
    }

    const char *src = gpuPciPaths[found];
    const size_t len = strlen(src);
    if (len >= PATH_MAX) {
      WARN("PCI path of %s is too long", busId);
      return ncclSystemError;
    }
    char *buf = (char *)malloc(PATH_MAX);
    if (buf == NULL) {
      WARN("Could not allocate PCI path for %s", busId);
      return ncclSystemError;
    }
    memcpy(buf, src, len + 1);
    *path = buf;
    return ncclSuccess;
  }

  // Returns 0 when the matrix marks the pair as unreachable (distance 255)
  // or when the pair is outside the matrix, 1 otherwise.
  int p2pCanConnect(int device1, int device2) {
    if (!validPair(device1, device2))
      return 0;
    int dist = connMatrix[device1*kConnMatrixDim+device2];
    if (dist == 255)
      return 0;
    return 1;
  }

  // Decodes a matrix distance into a link type and hop count:
  //   multiple of 15 -> link type 4, hops = dist/15
  //   multiple of 20 -> link type 2, hops = dist/20
  //   multiple of 36 -> link type 1, hops = dist/36
  // The checks run in that order, so a value such as 60 decodes as type 4.
  // NOTE(review): the type numbers presumably mirror the HSA link-info enum
  // (a caller compares against HSA_AMD_LINK_INFO_TYPE_XGMI) - confirm.
  // Returns hipErrorInvalidValue, leaving the outputs untouched, when the pair
  // is outside the matrix or the distance matches none of the patterns;
  // previously hipSuccess was returned with the outputs unset.
  hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
    if (!validPair(device1, device2))
      return hipErrorInvalidValue;
    int dist = connMatrix[device1*kConnMatrixDim+device2];
    if (dist%15 == 0) {
      *linktype = 4;
      *hopcount = dist/15;
    }
    else if (dist%20 == 0) {
      *linktype = 2;
      *hopcount = dist/20;
    }
    else if (dist%36 == 0) {
      *linktype = 1;
      *hopcount = dist/36;
    }
    else {
      return hipErrorInvalidValue;
    }
    return hipSuccess;
  }

  // Returns the NUMA id of the GPU whose PCI path equals `path`, or -1 when
  // no GPU has that path.
  virtual int getNumaId(char *path) {
    for (int n = 0; n < nGpus; n++)
      if (strcmp(path, gpuPciPaths[n]) == 0)
        return gpuNumaIds[n];
    return -1;
  }
};
// Description of the network devices of a modelled node: PCI paths, GUIDs
// and NUMA ids. All tables are borrowed (never freed here).
class NetDevices {
private:
  int nNetDevs;
  const char **netPciPaths;
  const uint64_t *netGuids; // IB ports on same card share the same GUID
  const int *netNumaIds;

  // Index of the device whose PCI path equals `path`, or -1 when none does.
  int findByPath(const char *path) const {
    for (int n = 0; n < nNetDevs; n++)
      if (strcmp(path, netPciPaths[n]) == 0)
        return n;
    return -1;
  }

public:
  NetDevices(const int nnetdevs, const char **netpcipaths, const uint64_t *netguids, const int *netnumaids) :
    nNetDevs(nnetdevs), netPciPaths(netpcipaths), netGuids(netguids), netNumaIds(netnumaids) {}
  NetDevices() : nNetDevs(0), netPciPaths(0), netGuids(0), netNumaIds(0) {}

  int getnDevs() { return nNetDevs; }

  // Returns a freshly malloc'ed copy of the PCI path of device `dev` in *path
  // (the caller frees it). On any failure *path is NULL and nothing is leaked.
  // Returns ncclSystemError when `dev` is out of range (negative indices
  // included), the path does not fit in PATH_MAX, or the allocation fails.
  ncclResult_t getPciPath(int dev, char** path) {
    *path = NULL;
    if (dev < 0 || dev >= nNetDevs) {
      WARN("Could not find real path of %d", dev);
      return ncclSystemError;
    }
    const char *src = netPciPaths[dev];
    const size_t len = strlen(src);
    if (len >= PATH_MAX) {
      WARN("PCI path of net device %d is too long", dev);
      return ncclSystemError;
    }
    char *buf = (char *)malloc(PATH_MAX);
    if (buf == NULL) {
      WARN("Could not allocate PCI path for net device %d", dev);
      return ncclSystemError;
    }
    memcpy(buf, src, len + 1);
    *path = buf;
    return ncclSuccess;
  }

  // Returns the NUMA id of the device whose PCI path equals `path`, or -1
  // when no device has that path.
  virtual int getNumaId(char *path) {
    const int n = findByPath(path);
    return (n >= 0) ? netNumaIds[n] : -1;
  }

  // Returns the GUID of the device whose PCI path equals `path`; warns and
  // returns 0 when no device has that path.
  uint64_t getIbGuid(char* path) {
    const int n = findByPath(path);
    if (n >= 0)
      return netGuids[n];
    WARN("Invalid IB path %s", path);
    return 0;
  }
};
// One modelled node: its CPU, GPU and network device descriptions plus the
// per-node bookkeeping fields. Most methods simply forward to the matching
// device table.
class NodeModel {
private:
  CpuDevices cpus;
  GpuDevices gpus;
  NetDevices netdevs;
public:
  int nodeId;
  int currRank;
  int firstRank;
  uint64_t hostHash; // auto-generated
  uint64_t pidHash; // auto-generated
  char description[256];

  // Local device index of a global rank: ranks of a node are consecutive,
  // starting at firstRank.
  int rankToCudaDev(int rank) { return rank - firstRank; }
  int getnGpus() { return gpus.getnDevs(); }
  int getnNetDevs() { return netdevs.getnDevs(); }
  ncclResult_t getGpuPciPath(char* busId, char** path) {
    return gpus.getPciPath(busId, path);
  }
  ncclResult_t getNetPciPath(int dev, char** path) {
    return netdevs.getPciPath(dev, path);
  }
  uint64_t getGpuBusId(int dev) {
    return gpus.getBusId(dev);
  }
  int p2pCanConnect(int device1, int device2) { return gpus.p2pCanConnect(device1, device2); }
  hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) {
    return gpus.getLinkTypeAndHopCount(device1, device2, linktype, hopcount);
  }
  uint64_t getIbGuid(char* path) {
    return netdevs.getIbGuid(path);
  }
  // The model places no restriction on SHM or NET connectivity.
  int shmCanConnect(int device1, int device2) { return 1; }
  int netCanConnect(int device1, int device2) { return 1; }

  // NUMA id of the GPU or, failing that, the network device at `path`.
  // Warns and returns 0 when the path belongs to neither.
  virtual int getNumaId(char *path) {
    int numa = gpus.getNumaId(path);
    if (numa != -1) return numa;
    numa = netdevs.getNumaId(path);
    if (numa != -1) return numa;
    WARN("Invalid path %s for getNumaId", path);
    return 0;
  }
  virtual ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) {
    return cpus.getCpuWidths(name, interCpu, cpuPci, p2pPci);
  }

  // `desc` is copied into `description`, truncated to fit. The scalar fields
  // start at zero instead of holding indeterminate values.
  NodeModel(CpuDevices cpu, GpuDevices gpu, NetDevices net, const char *desc) :
    cpus(cpu), gpus(gpu), netdevs(net),
    nodeId(0), currRank(0), firstRank(0), hostHash(0), pidHash(0) {
    // strncpy() does not terminate the destination when the source fills it,
    // so copy one byte less and terminate explicitly. A NULL desc is treated
    // as an empty description.
    strncpy(description, desc ? desc : "", sizeof(description) - 1);
    description[sizeof(description) - 1] = '\0';
  }
  NodeModel() : nodeId(0), currRank(0), firstRank(0), hostHash(0), pidHash(0) {
    memset(description, 0, sizeof(description));
  }
  ~NodeModel() {}
};
class NetworkModel {
private:
int nNodes;
int nRanks;
NodeModel nodes[NCCL_TOPO_MAX_NODES];
public:
void AddNode(NodeModel node) {
nodes[nNodes] = node;
nodes[nNodes].nodeId = nNodes;
nodes[nNodes].firstRank = nRanks;
nodes[nNodes].hostHash = ((uint64_t)rand() << 32) | rand();
nodes[nNodes].pidHash = ((uint64_t)rand() << 32) | rand();
nNodes++;
nRanks += node.getnGpus();
}
int GetNNodes() { return nNodes; }
int GetNRanks() { return nRanks; }
NodeModel* GetNode(int rank) {
int node_id;
if(rank < 0 || rank >= nRanks)
return 0;
for(node_id = nNodes-1; node_id >= 0; node_id--)
if(rank >= nodes[node_id].firstRank) break;
if (node_id >= 0) {
nodes[node_id].currRank = rank;
return nodes+node_id;
}
else
return 0;
}
NetworkModel() : nNodes(0), nRanks(0) {}
};
// Device tables for an 8-GPU node. Entry i of each GPU table describes the
// same GPU: busIds_8[i] corresponds to the last component of gpuPciPaths_8[i]
// (e.g. 0x1d000 <-> 0000:1d:00.0).
const static uint64_t busIds_8[] = { 0x1d000, 0x20000, 0x23000, 0x26000, 0x3f000, 0x43000, 0x46000, 0x49000 };
// GPUs 0-3 sit below pci0000:17, GPUs 4-7 below pci0000:3a.
const static char* gpuPciPaths_8[] = {
"/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:08.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0",
"/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:0c.0/0000:1e:00.0/0000:1f:00.0/0000:20:00.0",
"/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:10.0/0000:21:00.0/0000:22:00.0/0000:23:00.0",
"/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:14.0/0000:24:00.0/0000:25:00.0/0000:26:00.0",
"/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:04.0/0000:3d:00.0/0000:3e:00.0/0000:3f:00.0",
"/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:0c.0/0000:41:00.0/0000:42:00.0/0000:43:00.0",
"/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:10.0/0000:44:00.0/0000:45:00.0/0000:46:00.0",
"/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:14.0/0000:47:00.0/0000:48:00.0/0000:49:00.0",
};
// All eight GPUs report NUMA node 0.
const static int gpuPciNumaIds_8[] = { 0, 0, 0, 0, 0, 0, 0, 0 };
// Single-NIC variant: the NIC below pci0000:17.
const static char* netPciPaths_1[] = {
"/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
};
// Alternative single-NIC variant: the NIC below pci0000:3a.
const static char* netPciPaths_1_1[] = {
"/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
};
const static uint64_t netGuids_1[] = {
0xb8599f030007053aL,
};
const static int netPciNumaIds_1[] = { 0 };
// Two-NIC variant: both NICs above, each with its own GUID.
const static char* netPciPaths_2[] = {
"/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0",
"/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0",
};
const static uint64_t netGuids_2[] = {
0xb8599f030007053aL,
0x506b4b030027bbf2L,
};
const static int netPciNumaIds_2[] = { 0, 0 };
// Device tables for the "rome" 8-GPU node. Entry i of each GPU table
// describes the same GPU: rome_busIds_8[i] corresponds to the last component
// of rome_gpuPciPaths_8[i] (e.g. 0x63000 <-> 0000:63:00.0).
const static uint64_t rome_busIds_8[] = { 0x63000, 0x23000, 0x26000, 0x03000, 0xe3000, 0xc3000, 0xc6000, 0xa3000 };
const static char* rome_gpuPciPaths_8[] = {
"/sys/devices/pci0000:60/0000:60:03.1/0000:61:00.0/0000:62:00.0/0000:63:00.0",
"/sys/devices/pci0000:20/0000:20:01.1/0000:21:00.0/0000:22:00.0/0000:23:00.0",
"/sys/devices/pci0000:20/0000:20:03.1/0000:24:00.0/0000:25:00.0/0000:26:00.0",
"/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0",
"/sys/devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/0000:e2:00.0/0000:e3:00.0",
"/sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0/0000:c2:00.0/0000:c3:00.0",
"/sys/devices/pci0000:c0/0000:c0:03.1/0000:c4:00.0/0000:c5:00.0/0000:c6:00.0",
"/sys/devices/pci0000:a0/0000:a0:03.1/0000:a1:00.0/0000:a2:00.0/0000:a3:00.0",
};
// GPUs 0-3 report NUMA node 0, GPUs 4-7 report NUMA node 4.
const static int rome_gpuPciNumaIds_8[] = { 0, 0, 0, 0, 4, 4, 4, 4 };
// Single-NIC variant.
const static char* rome_netPciPaths_1[] = {
"/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
};
const static uint64_t rome_netGuids_1[] = {
0xb8599f030007053aL,
};
// NOTE(review): spelled "rom_" while the sibling tables use "rome_" - looks
// like a typo, but the name may be referenced elsewhere, so it is left as is.
const static int rom_netPciNumaIds_1[] = { 0 };
// Two-NIC variant: one NIC on NUMA node 0, one on NUMA node 4.
const static char* rome_netPciPaths_2[] = {
"/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0",
"/sys/devices/pci0000:80/0000:80:01.1/0000:81:00.0",
};
const static uint64_t rome_netGuids_2[] = {
0xb8599f030007053aL,
0x506b4b030027bbf2L,
};
// NOTE(review): same "rom_" spelling as rom_netPciNumaIds_1 above.
const static int rom_netPciNumaIds_2[] = { 0, 4 };
// 8x8 GPU-to-GPU distance matrices, stored row-major and read by GpuDevices
// as connMatrix[device1*8 + device2]. GpuDevices::getLinkTypeAndHopCount
// decodes an entry as: multiple of 15 -> link type 4, multiple of 20 -> link
// type 2, multiple of 36 -> link type 1, with the quotient as the hop count.
// GpuDevices::p2pCanConnect treats 255 as "cannot connect".

// Every off-diagonal pair is 40 (link type 2, 2 hops).
const int conn_mat_pcie[64] = {
0 , 40, 40, 40, 40, 40, 40, 40,
40, 0 , 40, 40, 40, 40, 40, 40,
40, 40, 0 , 40, 40, 40, 40, 40,
40, 40, 40, 0 , 40, 40, 40, 40,
40, 40, 40, 40, 0 , 40, 40, 40,
40, 40, 40, 40, 40, 0 , 40, 40,
40, 40, 40, 40, 40, 40, 0 , 40,
40, 40, 40, 40, 40, 40, 40, 0 ,
};
// Two groups of four GPUs (0-3 and 4-7): 15 or 30 inside a group
// (link type 4, 1 or 2 hops), 40 between groups.
const int conn_mat_4p2h[64] = {
0 , 15, 15, 30, 40, 40, 40, 40,
15, 0 , 30, 15, 40, 40, 40, 40,
15, 30, 0 , 15, 40, 40, 40, 40,
30, 15, 15, 0 , 40, 40, 40, 40,
40, 40, 40, 40, 0 , 15, 15, 30,
40, 40, 40, 40, 15, 0 , 30, 15,
40, 40, 40, 40, 15, 30, 0 , 15,
40, 40, 40, 40, 30, 15, 15, 0 ,
};
// All eight GPUs linked with type 4: each GPU has six peers at 15 (1 hop)
// and one peer at 30 (2 hops).
const int conn_mat_8p6l[64] = {
0 , 15, 15, 15, 15, 30, 15, 15,
15, 0 , 15, 15, 30, 15, 15, 15,
15, 15, 0 , 15, 15, 15, 15, 30,
15, 15, 15, 0 , 15, 15, 30, 15,
15, 30, 15, 15, 0 , 15, 15, 15,
30, 15, 15, 15, 15, 0 , 15, 15,
15, 15, 15, 30, 15, 15, 0 , 15,
15, 15, 30, 15, 15, 15, 15, 0 ,
};
// Same shape as conn_mat_8p6l, but the 2-hop peer of each GPU lies inside
// its own group of four.
const int conn_mat_8p6l_1[64] = {
0 , 15, 15, 30, 15, 15, 15, 15,
15, 0 , 30, 15, 15, 15, 15, 15,
15, 30, 0 , 15, 15, 15, 15, 15,
30, 15, 15, 0 , 15, 15, 15, 15,
15, 15, 15, 15, 0 , 15, 15, 30,
15, 15, 15, 15, 15, 0 , 30, 15,
15, 15, 15, 15, 15, 30, 0 , 15,
15, 15, 15, 15, 30, 15, 15, 0 ,
};
// Two groups of four GPUs: 40 inside a group (link type 2, 2 hops),
// 72 between groups (link type 1, 2 hops).
const int conn_mat_rome[64] = {
0 , 40, 40, 40, 72, 72, 72, 72,
40, 0 , 40, 40, 72, 72, 72, 72,
40, 40, 0 , 40, 72, 72, 72, 72,
40, 40, 40, 0 , 72, 72, 72, 72,
72, 72, 72, 72, 0 , 40, 40, 40,
72, 72, 72, 72, 40, 0 , 40, 40,
72, 72, 72, 72, 40, 40, 0 , 40,
72, 72, 72, 72, 40, 40, 40, 0 ,
};
#endif
+267
Просмотреть файл
@@ -0,0 +1,267 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_H_
#define NCCL_H_
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#define NCCL_MAJOR 2
#define NCCL_MINOR 5
#define NCCL_PATCH 6
#define NCCL_SUFFIX ""
#define NCCL_VERSION_CODE 2506
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
#define RCCL_BFLOAT16 1
#ifdef __cplusplus
extern "C" {
#endif
/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
/* Error type */
typedef enum { ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclNumResults = 6 } ncclResult_t;
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
* This integer is coded with the MAJOR, MINOR and PATCH level of the
* NCCL library
*/
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
* called once and the Id should be distributed to all ranks in the
* communicator before calling ncclCommInitRank. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/* Creates a new communicator (multi thread/process version).
* rank must be between 0 and nranks-1 and unique within a communicator clique.
* Each rank is associated to a CUDA device, which has to be set before calling
* ncclCommInitRank.
* ncclCommInitRank implicitly synchronizes with other ranks, so it must be
* called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
/* Creates a clique of communicators (single process version).
* This is a convenience function to create a single-process communicator clique.
* Returns an array of ndev newly initialized communicators in comm.
* comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
* If devlist is NULL, the first ndev CUDA devices are used.
* Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/* Frees resources associated with communicator object, but waits for any operations
* that might still be running on the device. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/* Frees resources associated with communicator object and aborts any operations
* that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
/* Returns a human-readable error message. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/* Returns the cuda device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/* Reduction operation selector */
typedef enum { ncclSum = 0,
ncclProd = 1,
ncclMax = 2,
ncclMin = 3,
ncclNumOps = 4 } ncclRedOp_t;
/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2, ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6, ncclHalf = 6,
ncclFloat32 = 7, ncclFloat = 7,
ncclFloat64 = 8, ncclDouble = 8,
ncclBfloat16 = 9,
ncclNumTypes = 10 } ncclDataType_t;
/*
* Collective communication operations
*
* Collective communication operations must be called separately for each
* communicator in a communicator clique.
*
* They return when operations have been enqueued on the CUDA stream.
*
* Since they may perform inter-CPU synchronization, each call has to be done
* from a different thread or process, or need to use Group Semantics (see
* below).
*/
/*
* Reduce
*
* Reduces data arrays of length count in sendbuff into recvbuff using op
* operation.
* recvbuff may be NULL on all calls except for root device.
* root is the rank (not the CUDA device) where data will reside after the
* operation is complete.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
/*
* (deprecated) Broadcast (in-place)
*
* Copies count values from root to all other devices.
* root is the rank (not the CUDA device) where data resides before the
* operation is started.
*
* This operation is implicitly in place.
*/
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
/*
* Broadcast
*
* Copies count values from root to all other devices.
* root is the rank (not the CUDA device) where data resides before the
* operation is started.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
/*
* All-Reduce
*
* Reduces data arrays of length count in sendbuff using op operation, and
* leaves identical copies of result on each recvbuff.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
/*
* Reduce-Scatter
*
* Reduces data in sendbuff using op operation and leaves reduced result
* scattered over the devices so that recvbuff on rank i will contain the i-th
* block of the result.
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
* should have a size of at least nranks*recvcount elements.
*
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
*/
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream);
/*
* All-Gather
*
* Each device gathers sendcount values from other GPUs into recvbuff,
* receiving data from rank i at offset i*sendcount.
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
* should have a size of at least nranks*sendcount elements.
*
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
*/
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*
* Group semantics
*
* When managing multiple GPUs from a single thread, and since NCCL collective
* calls may perform inter-CPU synchronization, we need to "group" calls for
* different ranks/devices into a single call.
*
* Grouping NCCL calls as being part of the same collective operation is done
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
* collective calls until the ncclGroupEnd call, which will wait for all calls
* to be complete. Note that for collective communication, ncclGroupEnd only
* guarantees that the operations are enqueued on the streams, not that
* the operation is effectively done.
*
* Both collective communication and ncclCommInitRank can be used in conjunction
* of ncclGroupStart/ncclGroupEnd.
*/
/*
* Group Start
*
* Start a group call. All subsequent calls to NCCL may not block due to
* inter-CPU synchronization.
*/
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();
/*
* Group End
*
* End a group call. Wait for all calls since ncclGroupStart to complete
* before returning.
*/
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
#ifdef __cplusplus
} // end extern "C"
#endif
#endif // end include guard
+44
Просмотреть файл
@@ -0,0 +1,44 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef UTILS_H_
#define UTILS_H_
// Per-rank record passed to bootstrapAllGather() and initTransportsRank_1():
// the rank's peer info together with a pointer to its communicator.
struct allGather1Data_t {
struct ncclPeerInfo peerInfo;
struct ncclComm* comm;
};
// Per-rank record passed to initTransportsRank_1() and initTransportsRank_3().
// NOTE(review): the field meanings are not visible in this header; the names
// suggest device capabilities plus one summary each for the tree and ring
// graphs - confirm against the code that fills them in.
struct allGather3Data_t {
int cudaCompCap;
int fullCudaCompCap;
int nvlink;
int nChannels;
// Summary for the tree graph.
struct {
int sameChannels;
int speedIntra;
int speedInter;
int nvlink;
} tree;
// Summary for the ring graph (same fields as `tree`).
struct {
int sameChannels;
int speedIntra;
int speedInter;
int nvlink;
} ring;
struct ncclTopoRanks topoRanks;
};
ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data);
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph);
#endif
+216
Просмотреть файл
@@ -0,0 +1,216 @@
/*
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "nccl.h"
#include "channel.h"
#include "nvmlwrap.h"
#include "bootstrap.h"
#include "transport.h"
#include "group.h"
#include "net.h"
#include "graph.h"
#include "argcheck.h"
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <hip/hip_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "model.h"
extern NodeModel *node_model;
// Net-plugin "devices" callback of the dummy network: reports how many
// network devices the current node model has. Always succeeds.
static ncclResult_t dummyNetDevices(int* ndev) {
  const int modelledNics = node_model->getnNetDevs();
  *ndev = modelledNics;
  return ncclSuccess;
}
// Net-plugin "PCI path" callback of the dummy network: asks the current node
// model for the PCI path of network device `dev`.
// Returns whatever the model returns, so an unknown device is reported as an
// error instead of being masked as ncclSuccess.
static ncclResult_t dummyNetPciPath(int dev, char** path) {
  return node_model->getNetPciPath(dev, path);
}
// Stand-in network plugin for the topology explorer. Only the name and the
// two callbacks backed by the node model are provided; every other slot is
// zero. The initializer is positional, so the order must follow the field
// order of ncclNet_t (declared elsewhere).
ncclNet_t ncclNetDummy = {
"IB",
0,
dummyNetDevices,
dummyNetPciPath,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
};
// The active network plugin is the dummy one defined above.
ncclNet_t* ncclNet = &ncclNetDummy;
// Stub: performs no lookup and leaves *device untouched; it only reports
// success.
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
  const ncclResult_t status = ncclSuccess;
  return status;
}
/* Map a numeric PCI bus id to the index of the matching GPU in the current
 * node model. Warns and returns 0 when no GPU has that bus id. */
int busIdToCudaDev(int64_t busId) {
  const int gpuCount = node_model->getnGpus();
  for (int idx = 0; idx < gpuCount; ++idx) {
    if (node_model->getGpuBusId(idx) == busId) {
      return idx;
    }
  }
  WARN("Invalid busId %lx", busId);
  return 0;
}
/* P2P reachability: *ret is 0 for peers on different hosts, otherwise the
 * answer of the node model for the two local devices. Always succeeds. */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  if (info1->hostHash == info2->hostHash) {
    const int devA = busIdToCudaDev(info1->busId);
    const int devB = busIdToCudaDev(info2->busId);
    *ret = node_model->p2pCanConnect(devA, devB);
  } else {
    // Peers on different nodes never use P2P.
    *ret = 0;
  }
  return ncclSuccess;
}
/* Send side of the P2P transport in the explorer: nothing is set up, the
 * function only logs which flavour of P2P would be used.
 * Returns ncclInternalError when both peers are the same device of the same
 * process, ncclSuccess otherwise. */
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
  const bool sameProcess = (myInfo->pidHash == peerInfo->pidHash);

  // Different processes: IPC.
  if (!sameProcess) {
    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
    return ncclSuccess;
  }

  // Same process and same device: rejected.
  if (myInfo->cudaDev == peerInfo->cudaDev) {
    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
    return ncclInternalError;
  }

  // Same process, different devices: direct pointer.
  INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
      channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
  return ncclSuccess;
}
/* Receive side of the P2P transport in the explorer: a no-op that reports
 * success. */
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
  const ncclResult_t status = ncclSuccess;
  return status;
}
// P2P transport table for the explorer: name, reachability test, then the
// send and receive callback groups. Only the first ("setup") slot of each
// group is provided; the remaining three slots are NULL.
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
{ p2pSendSetup, NULL, NULL, NULL },
{ p2pRecvSetup, NULL, NULL, NULL }
};
/* SHM reachability: *ret is 0 for peers on different hosts, otherwise the
 * answer of the node model for the two local devices. Always succeeds. */
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  if (info1->hostHash == info2->hostHash) {
    const int devA = busIdToCudaDev(info1->busId);
    const int devB = busIdToCudaDev(info2->busId);
    *ret = node_model->shmCanConnect(devA, devB);
  } else {
    // Shared memory cannot span different nodes.
    *ret = 0;
  }
  return ncclSuccess;
}
/* Send side of the SHM transport in the explorer: logs the connection and
 * reports success; nothing is actually set up. */
ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory",
      channelId,
      myInfo->rank, myInfo->busId,
      peerInfo->rank, peerInfo->busId);
  return ncclSuccess;
}
/* Receive side of the SHM transport in the explorer: a no-op that reports
 * success. */
ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
  const ncclResult_t status = ncclSuccess;
  return status;
}
/* Descriptor of the SHM transport: display name, connectivity predicate, then
 * the send-side and receive-side callback tables. Only the first (setup) slot
 * of each table is populated; the remaining slots are NULL. */
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
{ shmSendSetup, NULL, NULL, NULL },
{ shmRecvSetup, NULL, NULL, NULL }
};
/* Determine if two peers can communicate with NET.
 * The decision is delegated to the current node model, keyed by the two
 * peers' ranks; topo and graph are not consulted. Always returns ncclSuccess. */
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  const int firstRank = info1->rank;
  const int secondRank = info2->rank;
  *ret = node_model->netCanConnect(firstRank, secondRank);
  return ncclSuccess;
}
/* Send-side setup hook of the NET transport.
 * Looks up the network device the graph assigns to this channel and logs the
 * resulting connection. Returns the error from ncclTopoGetNetDev via
 * NCCLCHECK, otherwise ncclSuccess. */
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
  // GDR is never switched on for the send side here, so the log suffix stays empty.
  const int useGdr = 0;
  int netDev;
  NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &netDev));
  const char* gdrSuffix = useGdr ? "/GDRDMA" : "";
  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev,
      gdrSuffix);
  return ncclSuccess;
}
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
/* Receive-side setup hook of the NET transport.
 * Looks up the network device for this channel, asks the topology for the
 * distance between this GPU and that device, and reports GPU Direct RDMA as
 * usable only when the distance is strictly below the NetGdrLevel parameter.
 * The outcome is only logged. Errors from the topology queries are returned
 * via NCCLCHECK; otherwise ncclSuccess. */
ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
  int netDev;
  NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &netDev));

  // Check if we are close enough that it makes sense to enable GDR
  int netGdrLevel = ncclParamNetGdrLevel();
  int distance;
  NCCLCHECK(ncclTopoNetDistance(topo, myInfo->busId, netDev, &distance));

  int useGdr = 1;
  if (distance >= netGdrLevel) {
    useGdr = 0;
    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), myInfo->busId, netDev, distance, netGdrLevel);
  }

  const char* gdrSuffix = useGdr ? "/GDRDMA" : "";
  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev,
      gdrSuffix);
  return ncclSuccess;
}
/* Descriptor of the NET transport: display name, connectivity predicate, then
 * the send-side and receive-side callback tables. Only the first (setup) slot
 * of each table is populated; the remaining slots are NULL. */
struct ncclTransport netTransport = {
"NET",
netCanConnect,
{ netSendSetup, NULL, NULL, NULL },
{ netRecvSetup, NULL, NULL, NULL }
};
/* Table of all transports. selectTransport() walks it from index 0 and uses
 * the first entry whose canConnect reports success, so the order here is the
 * preference order: P2P, then SHM, then NET. */
struct ncclTransport ncclTransports[NTRANSPORTS] = {
p2pTransport,
shmTransport,
netTransport,
};
+231
Просмотреть файл
@@ -0,0 +1,231 @@
/*
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "nccl.h"
#include "channel.h"
#include "nvmlwrap.h"
#include "bootstrap.h"
#include "transport.h"
#include "group.h"
#include "net.h"
#include "graph.h"
#include "argcheck.h"
#include "cpuset.h"
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <hip/hip_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstdio>
#include <iostream>
#include <cstring>
#include "model.h"
#include "utils.h"
// Node model of the rank currently being processed. main() repoints it to
// network.GetNode(i) before every per-rank call.
NodeModel *node_model;
// Return the argument that immediately follows `option` in [begin, end).
// Yields a null pointer when `option` is absent or is the last argument.
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
  char** hit = std::find(begin, end, option);
  if (hit == end) {
    return 0;
  }
  char** value = hit + 1;
  if (value == end) {
    return 0;
  }
  return *value;
}
// Report whether any argument in [begin, end) compares equal to `option`.
bool cmdOptionExists(char** begin, char** end, const std::string& option) {
  for (char** cursor = begin; cursor != end; ++cursor) {
    if (*cursor == option) {
      return true;
    }
  }
  return false;
}
// Human-readable description of every selectable model. The array index is the
// model_id given with "-m", so entries must stay in step with the switch in
// main(). The trailing NULL ends the usage-listing loop.
const char *model_descriptions[] = {
"4 nodes with 8 GPUs PCIe 1 NIC",
"4 nodes with 8 GPUs PCIe 2 NIC",
"2 nodes VEGA20 4P1H",
"4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC",
"single node gfx908 4P3L",
"single node gfx908 8P6L",
"single node gfx908 8P6L Alt. Connection",
"single node 8 GPUs PCIe on Rome",
"4 nodes 8 GPUs PCIe 2 NICs on Rome",
"3 nodes 8 GPUs PCIe + 1 Rome 8 GPUs PCIe + 2 nodes gfx908 4P3L",
NULL,
};
int main(int argc,char* argv[])
{
struct ncclComm *comm;
if (!cmdOptionExists(argv, argv + argc, "-m")) {
printf("Usage: ./topo_expl -m model_id\n");
printf("List of model_id:\n");
for (int i = 0; model_descriptions[i] != NULL; i++)
printf(" %d: %s\n", i, model_descriptions[i]);
exit(0);
}
int model_id = 0;
char *mi = getCmdOption(argv, argv + argc, "-m");
if (mi)
model_id = atol(mi);
// CPU, GPU and NIC devices on Skylake
CpuDevices skylake("Skylake", 12, 12, 12);
GpuDevices vg20_pcie(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_pcie);
GpuDevices vg20_4p1h(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
GpuDevices vg20_4p2h(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h);
GpuDevices gfx908_4p3l(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
GpuDevices gfx908_8p6l(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l);
GpuDevices gfx908_8p6l_1(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l_1);
NetDevices nic_1(1, netPciPaths_1, netGuids_1, netPciNumaIds_1);
NetDevices nic_1_1(1, netPciPaths_1_1, netGuids_1, netPciNumaIds_1);
NetDevices nic_2(2, netPciPaths_2, netGuids_2, netPciNumaIds_2);
// CPU, GPU and NIC devices on Rome
CpuDevices rome("Rome", 24, 24, 24);
GpuDevices vg20_pcie_rome(8, rome_busIds_8, rome_gpuPciPaths_8, rome_gpuPciNumaIds_8, conn_mat_rome);
NetDevices nic_1_rome(1, rome_netPciPaths_1, rome_netGuids_1, rom_netPciNumaIds_1);
NetDevices nic_2_rome(2, rome_netPciPaths_2, rome_netGuids_2, rom_netPciNumaIds_2);
// 8 GPUs PCIe 1 NIC
NodeModel model_8pcie_1nic(skylake, vg20_pcie, nic_1, "Skylake 8 GPUs PCIe");
// 8 GPUs PCIe 2 NIC
NodeModel model_8pcie_2nic(skylake, vg20_pcie, nic_2, "Skylake 8 GPUs PCIe 2 NIC");
// VEGA20 4P1H, use VEGA20 4P2H model
NodeModel model_vg20_4p1h_1nic(skylake, vg20_4p1h, nic_1, "Skylake VEGA20 4P1H");
// VEGA20 GPUs XGMI 4P2H 1 NIC
NodeModel model_vg20_4p2h_1nic(skylake, vg20_4p2h, nic_1_1, "Skylake VEGA20 4P2H");
// gfx908 4P3L
NodeModel model_gfx908_4p_1nic(skylake, gfx908_4p3l, nic_1, "Skylake gfx908 4P3L");
// gfx908 8P6L
NodeModel model_gfx908_8p_1nic(skylake, gfx908_8p6l, nic_1, "Skylake gfx908 8P6L");
// gfx908 8P6L alternative connection
NodeModel model_gfx908_8p_1nic_1(skylake, gfx908_8p6l_1, nic_1, "Skylake gfx908 8P6L Alt. Connection");
// 8 GPUs PCIe on Rome
NodeModel model_8pcie_1nic_rome(rome, vg20_pcie_rome, nic_1_rome, "Rome 8 GPUs PCIe");
// 8 GPUs PCIe 2 NICs on Rome
NodeModel model_8pcie_2nic_rome(rome, vg20_pcie_rome, nic_2_rome, "Rome 8 GPUs PCIe 2 NICs");
NetworkModel network;
switch(model_id) {
case 0:
for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_1nic);
break;
case 1:
for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic);
break;
case 2:
for (int i = 0; i < 2; i ++) network.AddNode(model_vg20_4p1h_1nic);
break;
case 3:
for (int i = 0; i < 4; i ++) network.AddNode(model_vg20_4p2h_1nic);
break;
case 4:
network.AddNode(model_gfx908_4p_1nic);
break;
case 5:
network.AddNode(model_gfx908_8p_1nic);
break;
case 6:
network.AddNode(model_gfx908_8p_1nic_1);
break;
case 7:
network.AddNode(model_8pcie_1nic_rome);
break;
case 8:
for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic_rome);
break;
case 9:
for (int i = 0; i < 3; i ++) network.AddNode(model_8pcie_1nic);
network.AddNode(model_8pcie_1nic_rome);
for (int i = 0; i < 2; i ++) network.AddNode(model_gfx908_4p_1nic);
break;
default:
printf("Invalid model_id %d\n", model_id);
exit(0);
}
printf("Generating topology using %d: %s\n", model_id, model_descriptions[model_id]);
int nranks = network.GetNRanks();
int nnodes = network.GetNNodes();
printf("nnodes = %d, nranks = %d\n", nnodes, nranks);
for (int i = 0; i < nranks; i++) {
node_model = network.GetNode(i);
assert(node_model!=0);
printf("Rank %d: node %d (%s) GPU busId %lx\n", i, node_model->nodeId,
node_model->description, node_model->getGpuBusId(node_model->rankToCudaDev(i)));
}
NCCLCHECK(ncclCalloc(&comm, nranks));
struct allGather1Data_t *allGather1Data;
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
struct allGather3Data_t *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
for (int i = 0; i < nranks; i++) {
comm[i].rank = i;
comm[i].nRanks = nranks;
node_model = network.GetNode(i);
assert(node_model!=0);
bootstrapAllGather(&comm[i], allGather1Data);
}
struct ncclTopoGraph treeGraph, ringGraph;
for (int i = 0; i < nranks; i++) {
node_model = network.GetNode(i);
assert(node_model!=0);
initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph);
}
for (int i = 0; i < nranks; i++) {
node_model = network.GetNode(i);
assert(node_model!=0);
initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph);
}
free(allGather3Data);
free(allGather1Data);
free(comm);
printf("Done generating topology using %d: %s\n", model_id, model_descriptions[model_id]);
return 0;
}
+420
Просмотреть файл
@@ -0,0 +1,420 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "channel.h"
#include "nvmlwrap.h"
#include "bootstrap.h"
#include "transport.h"
#include "group.h"
#include "net.h"
#include "graph.h"
#include "argcheck.h"
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <hip/hip_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "model.h"
#include "utils.h"
// Node model of the rank currently being processed; defined in another
// translation unit. Read here by the logging and bootstrap code.
extern NodeModel *node_model;
// Provides ncclParamCrossNic(), which is used below to fill the crossNic field
// of the tree and ring graphs. NOTE(review): presumably backed by the
// NCCL_CROSS_NIC environment variable with default 2 - confirm against the
// NCCL_PARAM macro definition.
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
// Report the compute capability encoded as major*10+minor.
// No device is queried: the value is fixed at major 1, minor 0.
int ncclCudaCompCap() {
  const int capMajor = 1;
  const int capMinor = 0;
  return 10 * capMajor + capMinor;
}
// Format a packed bus id as "dddd:bb:dd.f" into busId.
// Bits 0-3 are the function, 4-11 the device, 12-19 the bus and the remaining
// upper bits the domain. The caller must supply a buffer large enough for the
// formatted text plus terminator; no size is checked here.
ncclResult_t int64ToBusId(int64_t id, char* busId) {
  const int64_t domainPart = id >> 20;
  const int64_t busPart = (id & 0xff000) >> 12;
  const int64_t devicePart = (id & 0xff0) >> 4;
  const int64_t functionPart = id & 0xf;
  sprintf(busId, "%04lx:%02lx:%02lx.%01lx", domainPart, busPart, devicePart, functionPart);
  return ncclSuccess;
}
// Parse a textual PCI bus id (e.g. "0000:02:00.0") into a packed integer.
// The '.' and ':' separators are dropped, the remaining leading hex digits are
// concatenated, and the result is converted with base 16. Parsing stops at the
// first character that is neither a separator nor a hex digit.
//   busId: NUL-terminated input string.
//   id:    receives the parsed value.
// Returns ncclSuccess, or the allocation error propagated by NCCLCHECK.
ncclResult_t busIdToInt64(char* busId, int64_t* id) {
  const int size = strlen(busId);
  char* hexStr;
  // One extra byte for the terminator: an input without separators yields
  // `size` hex digits, and the '\0' would otherwise land past the buffer.
  NCCLCHECK(ncclCalloc(&hexStr, size+1));
  int hexOffset = 0;
  for (int i=0; i<size; i++) {
    char c = busId[i];
    if (c == '.' || c == ':') continue;
    if ((c >= '0' && c <= '9') ||
        (c >= 'A' && c <= 'F') ||
        (c >= 'a' && c <= 'f')) {
      hexStr[hexOffset++] = busId[i];
    } else break;
  }
  hexStr[hexOffset] = '\0';
  *id = strtol(hexStr, NULL, 16);
  free(hexStr);
  return ncclSuccess;
}
// Current log level; -1 means "not initialised yet".
int ncclDebugLevel = -1;
// Initialise ncclDebugLevel from the NCCL_DEBUG environment variable.
// Does nothing once a level has been set. An unset variable selects
// NCCL_LOG_NONE; the names VERSION, WARN, INFO, ABORT and TRACE are matched
// case-insensitively. Any other value leaves the level at -1.
void ncclDebugInit() {
  if (ncclDebugLevel != -1) return;

  const char* requested = getenv("NCCL_DEBUG");
  if (requested == NULL) {
    ncclDebugLevel = NCCL_LOG_NONE;
    return;
  }

  static const struct { const char* name; int level; } knownLevels[] = {
    { "VERSION", NCCL_LOG_VERSION },
    { "WARN",    NCCL_LOG_WARN },
    { "INFO",    NCCL_LOG_INFO },
    { "ABORT",   NCCL_LOG_ABORT },
    { "TRACE",   NCCL_LOG_TRACE },
  };
  const size_t nKnown = sizeof(knownLevels) / sizeof(knownLevels[0]);
  for (size_t k = 0; k < nKnown; k++) {
    if (strcasecmp(requested, knownLevels[k].name) == 0) {
      ncclDebugLevel = knownLevels[k].level;
      return;
    }
  }
}
// Logging back end of the explorer.
// Prints "[node:rank] <formatted message>" to stdout. TRACE messages are
// dropped unless the debug level is TRACE; every other level is printed.
// A WARN message additionally prints its origin to stderr and aborts the
// process, so any warning is fatal.
//   level:    severity of the message.
//   flags:    subsystem mask; not used here.
//   filefunc: origin reported on abort.
//   line:     source line reported on abort.
//   fmt, ...: printf-style format and arguments.
// Output longer than the internal 1024-byte buffer is truncated.
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
  if (ncclDebugLevel == -1) ncclDebugInit();
  if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return;

  char buffer[1024];
  buffer[0] = '\0';
  int prefixLen = snprintf(buffer, sizeof(buffer),
      "[%d:%d] ", node_model->nodeId, node_model->currRank);
  // snprintf reports the untruncated length (or a negative error); clamp it so
  // the remaining-space computation below can never underflow.
  size_t len = 0;
  if (prefixLen > 0) {
    len = (size_t)prefixLen;
    if (len > sizeof(buffer) - 1) len = sizeof(buffer) - 1;
  }

  va_list args;
  va_start(args, fmt);
  // Bounded formatting: a long message is truncated instead of overrunning
  // the stack buffer.
  vsnprintf(buffer+len, sizeof(buffer)-len, fmt, args);
  va_end(args);
  printf("%s\n", buffer);

  if (level == NCCL_LOG_WARN) {
    fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n",
        node_model->nodeId, node_model->currRank, filefunc, line);
    abort();
  }
}
// Stand-in for the first bootstrap all-gather: fill this rank's slot of
// allGather1Data with peer information taken from the current node model.
// gdrSupport is fixed at 1 and shmDev at 0x19. Always returns ncclSuccess.
ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) {
  // AllGather1 - begin
  struct ncclPeerInfo* self = &allGather1Data[comm->rank].peerInfo;

  self->rank = comm->rank;
  self->cudaDev = node_model->rankToCudaDev(comm->rank);
  self->gdrSupport = 1;
  self->hostHash = node_model->hostHash;
  self->pidHash = node_model->pidHash;
  self->shmDev = 0x19;
  self->busId = node_model->getGpuBusId(node_model->rankToCudaDev(comm->rank));
  return ncclSuccess;
}
// First initialisation phase for one rank.
//
// Copies every rank's peer info from allGather1Data into comm->peerInfo,
// builds and trims the topology, searches the tree and ring graphs, and
// records this rank's results in allGather3Data[rank].
//
//   comm:           communicator of the rank; peerInfo, topo and nChannels are written.
//   allGather1Data: peer info of all ranks, already filled by bootstrapAllGather.
//   allGather3Data: per-rank output array; only entry [rank] is written.
//   treeGraph:      caller-owned graph, overwritten with the tree search result.
//   ringGraph:      caller-owned graph, overwritten with the ring search result.
//
// Returns ncclInvalidUsage when another rank reports the same host hash and
// bus id as this rank, any error propagated through NCCLCHECK, otherwise
// ncclSuccess.
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data,
    struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph) {
  // We use 3 AllGathers
  // 1. { peerInfo, comm }
  // 2. ConnectTransport[nranks], ConnectValue[nranks]
  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }

  int rank = comm->rank;
  int nranks = comm->nRanks;

  // AllGather1 - begin
  // The exchange itself is not performed here: allGather1Data arrives
  // already populated for every rank.
  struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;

  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
  for (int i = 0; i < nranks; i++) {
    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
      // busId is printed with %lx everywhere else; %x does not match its width.
      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
      return ncclInvalidUsage;
    }
  }
  // AllGather1 data is used again below
  // AllGather1 - end

  // Topo detection / System graph creation
  NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
  // Compute paths between GPUs and NICs
  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
  // Remove inaccessible GPUs and unused NICs
  NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
  // Recompute paths after trimming
  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
  // Compute max speed to accelerate search
  NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
  // Print final topology
  NCCLCHECK(ncclTopoPrint(comm->topo));

  // Get rings and trees
  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
  treeGraph.crossNic = ncclParamCrossNic();
  // We communicate only half the data between node with trees on 2 nodes.
  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));

  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
  ringGraph.crossNic = ncclParamCrossNic();
  NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));

  // AllGather3 - begin
  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
  allGather3Data[rank].nvlink = treeGraph.nvlink;
  // Both patterns must be usable on every channel, so take the smaller count.
  allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
  allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
  allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
  allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
  allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
  allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
  allGather3Data[rank].ring.nvlink = ringGraph.nvlink;

  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));

  return ncclSuccess;
}
// Pick the first transport in ncclTransports whose canConnect accepts the
// (myInfo, peerInfo) pair, attach its send (type == 1) or receive (any other
// type) callback table to the connector, and run that table's setup callback.
// Returns ncclInternalError after a WARN when no transport accepts the pair;
// errors from canConnect/setup are propagated through NCCLCHECK.
template <int type>
static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
  for (int idx = 0; idx < NTRANSPORTS; idx++) {
    struct ncclTransport* candidate = &ncclTransports[idx];
    struct ncclTransportComm* callbacks = &candidate->recv;
    if (type == 1) callbacks = &candidate->send;

    int usable = 0;
    NCCLCHECK(candidate->canConnect(&usable, topo, graph, myInfo, peerInfo));
    if (!usable) continue;

    // CPU-affinity handling of the real library is intentionally absent here.
    connector->transportComm = callbacks;
    NCCLCHECK(callbacks->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
    return ncclSuccess;
  }
  WARN("No transport found !");
  return ncclInternalError;
}
// Set up the connectors of one channel towards the given receive and send
// peers. Entries equal to -1 are ignored and already-connected connectors are
// skipped. The first two passes select a transport per connector; the last two
// passes only mark the connectors as connected, since no connect data is
// exchanged here. Returns the first error from selectTransport, otherwise
// ncclSuccess.
static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
  struct ncclConnect connect;

  // Pass 1: choose a transport for every receive connector.
  for (int r = 0; r < nrecv; r++) {
    const int peer = peerRecv[r];
    if (peer == -1) continue;
    struct ncclConnector* rx = &channel->peers[peer].recv;
    if (rx->connected) {
      ++nSkippedRecv;
      continue;
    }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, rx, channel->buffSize, channel->id));
  }

  // Pass 2: choose a transport for every send connector.
  for (int s = 0; s < nsend; s++) {
    const int peer = peerSend[s];
    if (peer == -1) continue;
    struct ncclConnector* tx = &channel->peers[peer].send;
    if (tx->connected) {
      ++nSkippedSend;
      continue;
    }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, tx, channel->buffSize, channel->id));
  }

  // Pass 3: mark send connectors as connected (no connect call is made).
  for (int s = 0; s < nsend; s++) {
    const int peer = peerSend[s];
    if (peer == -1) continue;
    struct ncclConnector* tx = &channel->peers[peer].send;
    if (tx->connected) {
      ++nSkippedSend;
      continue;
    }
    memset(&connect, 0, sizeof(connect));
    tx->connected = 1;
  }

  // Pass 4: mark receive connectors as connected (no connect call is made).
  for (int r = 0; r < nrecv; r++) {
    const int peer = peerRecv[r];
    if (peer == -1) continue;
    struct ncclConnector* rx = &channel->peers[peer].recv;
    if (rx->connected) {
      ++nSkippedRecv;
      continue;
    }
    memset(&connect, 0, sizeof(connect));
    rx->connected = 1;
  }

  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
  return ncclSuccess;
}
// Prepare host-side state of channel `channelid`: store its id, allocate the
// ring's user-rank table and the per-peer array (both sized by comm->nRanks),
// and point every peer's send and receive connector back at comm.
// Device-side allocations are not performed here. Returns an allocation error
// via NCCLCHECK, otherwise ncclSuccess.
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* channel = &comm->channels[channelid];
  channel->id = channelid;

  // Ring index to user rank table.
  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));

  // Communication structures with peers.
  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
  for (int peer = 0; peer < comm->nRanks; peer++) {
    channel->peers[peer].send.comm = comm;
    channel->peers[peer].recv.comm = comm;
  }

  return ncclSuccess;
}
// Initialise channel `channelId` and fill its ring's userRanks with ringRanks
// rotated so that `rank` comes first. When `rank` does not occur in ringRanks
// the order is copied unrotated. Returns the error from initChannel, otherwise
// ncclSuccess.
static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
  NCCLCHECK(initChannel(comm, channelId));

  struct ncclRing* ring = &comm->channels[channelId].ring;

  // Locate our own position in the ring.
  int start = 0;
  while (start < nranks && ringRanks[start] != rank) {
    start++;
  }

  // Copy the ring beginning at that position, wrapping around.
  for (int pos = 0; pos < nranks; pos++) {
    ring->userRanks[pos] = ringRanks[(pos+start)%nranks];
  }
  return ncclSuccess;
}
// Final initialisation phase for one rank.
//
// Consumes the per-rank results gathered in allGather3Data: derives the node
// count and this rank's node index, aligns channel counts and graph properties
// to the minimum over all ranks, builds the rings and trees through
// ncclTopoPostset, logs the trees, then sets up every channel and selects a
// transport for the ring and tree neighbours.
//
//   comm:           communicator of the rank; nNodes, node, nvlink, nChannels
//                   and the channels are updated.
//   allGather3Data: results of initTransportsRank_1 for all ranks (read only).
//   treeGraph:      caller-owned tree graph; its fields are lowered to the
//                   minimum over all ranks.
//   ringGraph:      caller-owned ring graph; same treatment.
//
// Returns any error propagated through NCCLCHECK, otherwise ncclSuccess.
ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph,
    struct ncclTopoGraph& ringGraph) {
  int rank = comm->rank;
  int nranks = comm->nRanks;

  // Determine nNodes, firstRanks, ...
  // Ranks sharing the same first ring-receive rank are counted as one node.
  int* nodesFirstRank;
  NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
  for (int i=0; i<nranks; i++) {
    int node = -1;
    int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
    for (int n=0; n<comm->nNodes; n++) {
      if (nodesFirstRank[n] == firstRank) node = n;
    }
    if (node == -1) {
      node = comm->nNodes++;
      nodesFirstRank[node] = firstRank;
    }
    if (i == comm->rank) comm->node = node;
  }

  // Determine the minimum CUDA Compute capability of all GPUs
  // (computed here but not consumed further in this function).
  int myCompCap = allGather3Data[rank].cudaCompCap;
  int minCompCap = myCompCap, maxCompCap = myCompCap;
  for (int i = 0; i < nranks; i++) {
    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
    maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
  }

  // nvlink stays set only when every rank reported it.
  comm->nvlink = 1;
  for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;

  int nChannelsOrig = comm->nChannels;
  struct ncclTopoRanks** allTopoRanks;
  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
  for (int i=0; i<nranks; i++) {
    allTopoRanks[i] = &allGather3Data[i].topoRanks;
    // Make sure we align all ranks so that the tuning is consistent across ranks
    treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
    treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
    treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
    treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
    ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
    ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
    ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
  }

  if (comm->nChannels < nChannelsOrig) {
    // We started duplicating channels during Preset(), so we need to move the
    // duplicated channels since we have removed some.
    for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
  }

  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

  // Build the log line with bounded appends so that a large node count is
  // truncated instead of overflowing the buffer.
  char line[1024];
  snprintf(line, sizeof(line), "nodesFirstRank: ");
  for (int i=0; i<comm->nNodes; i++) {
    size_t used = strlen(line);
    snprintf(line+used, sizeof(line)-used, "%d ", nodesFirstRank[i]);
  }
  INFO(NCCL_INIT, "%s", line);

  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));

  free(allTopoRanks);
  free(nodesFirstRank);
  // AllGather3 - end

  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);

  line[0]='\0';
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclTree* treeUp = &comm->channels[c].treeUp;
    struct ncclTree* treeDn = &comm->channels[c].treeDn;
    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
        c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
        treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
  }
  line[1023] = '\0';
  INFO(NCCL_INIT, "Trees%s", line);

  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

  // Connect with prev/next for each ring
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    // rings must still be alive here: each channel reads its own slice.
    NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
    if (comm->nRanks == 1) continue;
    NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
  }
  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);

  // Release the ring table only after the last channel has consumed it.
  free(rings);
  return ncclSuccess;
}