diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc index 04390940f4..328c9a721b 100644 --- a/projects/rccl/src/graph/topo.cc +++ b/projects/rccl/src/graph/topo.cc @@ -30,9 +30,15 @@ const char* topoLinkTypeStr[] = { "LOC", "XGMI", "PCI", "QPI", "NET" }; const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" }; #endif +#ifdef TOPO_EXPL /* topo_expl build: the lookups below are answered by node_model, which is defined outside this file */ +#include "model.h" +extern NodeModel *node_model; +#endif + /******************************************************************/ /******************* Graph Creation Functions *********************/ /******************************************************************/ +#ifndef TOPO_EXPL /* the implementations that follow are compiled only when TOPO_EXPL is not defined */ static int getNumaId(char *path) { char npath[PATH_MAX]; snprintf(npath, PATH_MAX, "%s/numa_node", path); @@ -59,6 +65,15 @@ static ncclResult_t getPciPath(char* busId, char** path) { } return ncclSuccess; } +#else /* TOPO_EXPL: same two signatures, forwarded to node_model */ +static int getNumaId(char *path) { + return node_model->getNumaId(path); +} + +static ncclResult_t getPciPath(char* busId, char** path) { + return node_model->getGpuPciPath(busId, path); +} +#endif /* TOPO_EXPL */ // Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000. 
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { @@ -102,6 +117,7 @@ int interCpuWidth = 0; int cpuPciWidth = 0; int p2pPciWidth = 0; +#ifndef TOPO_EXPL static ncclResult_t getCpuWidths() { // Check if already detected if (interCpuWidth + cpuPciWidth + p2pPciWidth) return ncclSuccess; @@ -182,6 +198,14 @@ static ncclResult_t getCpuWidths() { INFO(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth); return ncclSuccess; } +#else /* TOPO_EXPL: the three widths and the CPU name are supplied by node_model */ +static ncclResult_t getCpuWidths() { + char cpu[256]; /* filled by the model; only valid once the call below has succeeded */ + NCCLCHECK(node_model->getCpuWidths(cpu, &interCpuWidth, &cpuPciWidth, &p2pPciWidth)); /* propagate a model failure instead of tracing an unset name and keeping stale widths */ + TRACE(NCCL_GRAPH, "%s CPU (CPU-PCI %d, PCI/P2P %d, InterCpu %d)", cpu, cpuPciWidth, p2pPciWidth, interCpuWidth); + return ncclSuccess; +} +#endif /* TOPO_EXPL */ static ncclResult_t ncclTopoGetInterCpuWidth(int* width) { NCCLCHECK(getCpuWidths()); @@ -272,7 +296,11 @@ ncclResult_t ncclTopoConnectXGMI(struct ncclComm* comm, struct ncclTopoSystem* s uint32_t link_type, hops; int cudaDev1 = busIdToCudaDev(comm->peerInfo[gpu1->rank].busId); int cudaDev2 = busIdToCudaDev(comm->peerInfo[gpu2->rank].busId); +#ifndef TOPO_EXPL if (hipExtGetLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) { +#else /* TOPO_EXPL: link type and hop count come from the model's connection matrix */ + if (node_model->getLinkTypeAndHopCount(cudaDev1, cudaDev2, &link_type, &hops) == hipSuccess) { +#endif if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) { NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu2, LINK_NVL, minWidth)); } @@ -424,6 +452,7 @@ ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTop // Try to detect if IB cards are in fact the same physical NIC, hence sharing ports. 
#include #define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid" +#ifndef TOPO_EXPL uint64_t getIbGuid(char* path) { uint64_t guid = 0ULL; char guidPath[PATH_MAX]; @@ -446,6 +475,11 @@ uint64_t getIbGuid(char* path) { } return guid; } +#else /* TOPO_EXPL: the GUID is looked up in node_model by PCI path */ +uint64_t getIbGuid(char* path) { + return node_model->getIbGuid(path); +} +#endif /* TOPO_EXPL */ struct netInfo { char* path; diff --git a/projects/rccl/tools/topo_expl/Makefile b/projects/rccl/tools/topo_expl/Makefile new file mode 100644 index 0000000000..e19ded6d27 --- /dev/null +++ b/projects/rccl/tools/topo_expl/Makefile @@ -0,0 +1,19 @@ +# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. +HIP_PATH ?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) +HIP_PATH = ../../.. +endif +HIPCC = $(HIP_PATH)/bin/hipcc + +EXE = topo_expl +CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE + +files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/graph/search.cc ../../src/graph/connect.cc + +all: $(EXE) + +$(EXE): $(files) + $(HIPCC) $(CXXFLAGS) $^ -o $@ + +clean: + rm -f *.o $(EXE) diff --git a/projects/rccl/tools/topo_expl/include/model.h b/projects/rccl/tools/topo_expl/include/model.h new file mode 100644 index 0000000000..116febad1d --- /dev/null +++ b/projects/rccl/tools/topo_expl/include/model.h @@ -0,0 +1,419 @@ +/* +Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef MODEL_H_ +#define MODEL_H_ + +class CpuDevices { +private: + char *cpuName; + int interCpuWidth; + int cpuPciWidth; + int p2pPciWidth; + +public: + CpuDevices(const char *cpuname, const int intercpuwidth, const int cpupciwidth, const int p2ppciwidth) : + cpuName((char *)cpuname), interCpuWidth(intercpuwidth), cpuPciWidth(cpupciwidth), p2pPciWidth(p2ppciwidth) {} + + CpuDevices() : cpuName(0), interCpuWidth(0), cpuPciWidth(0), p2pPciWidth(0) {} + + ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) { + strcpy(name, cpuName); + *interCpu = interCpuWidth; + *cpuPci = cpuPciWidth; + *p2pPci = p2pPciWidth; + return ncclSuccess; + } +}; + +class GpuDevices { +private: + int nGpus; + uint64_t *busIds; + char **gpuPciPaths; + int *gpuNumaIds; + int *connMatrix; + +public: + GpuDevices(const int ngpus, const uint64_t *busids, const char **gpupcipaths, const int *gpunumaids, const int *connmatrix) : + nGpus(ngpus), busIds((uint64_t *)busids), gpuPciPaths((char **)gpupcipaths), gpuNumaIds((int *)gpunumaids), connMatrix((int *)connmatrix) {} + + GpuDevices () : nGpus(0), busIds(0), gpuPciPaths(0), gpuNumaIds(0), connMatrix(0) {} + + int getnDevs() { return nGpus; } + + uint64_t getBusId(int dev) { return busIds[dev]; } + + ncclResult_t getPciPath(char* busId, char** path) { + char tempBusId[] = "0000:00:00.0"; + *path = (char *)malloc(PATH_MAX); + int i; + for (i = 0; i < nGpus; i++) { + NCCLCHECK(int64ToBusId(busIds[i], tempBusId)); + if (strcmp(busId, tempBusId) == 0) + break; + } + if (i < nGpus) + strcpy(*path, gpuPciPaths[i]); + else { + WARN("Could not find real path of %s", busId); + return ncclSystemError; + } + return ncclSuccess; + } + + int p2pCanConnect(int device1, int device2) { + // connection matrix are 8 GPUs + int dist = connMatrix[device1*8+device2]; + if (dist == 255) + return 0; + //if (dist%15 == 0 && dist/15 != 1) { + // return 0; + //} + return 1; + }; + + hipError_t getLinkTypeAndHopCount(int device1, int 
device2, uint32_t* linktype, uint32_t* hopcount) { + // connection matrix are 8 GPUs + int dist = connMatrix[device1*8+device2]; + + if (dist%15 == 0) { + *linktype = 4; + *hopcount = dist/15; + } + else if (dist%20 == 0) { + *linktype = 2; + *hopcount = dist/20; + } + else if (dist%36 == 0) { + *linktype = 1; + *hopcount = dist/36; + } + return hipSuccess; + } + + virtual int getNumaId(char *path) { + int n; + // search for all GPUs + for (n = 0; n < nGpus; n++) + if (strcmp(path, gpuPciPaths[n]) == 0) + break; + if (n < nGpus) + return gpuNumaIds[n]; + return -1; + } +}; + +class NetDevices { +private: + int nNetDevs; + char **netPciPaths; + uint64_t *netGuids; // IB ports on same card share the same GUID + int *netNumaIds; + +public: + NetDevices(const int nnetdevs, const char **netpcipaths, const uint64_t *netguids, const int *netnumaids) : + nNetDevs(nnetdevs), netPciPaths((char **)netpcipaths), netGuids((uint64_t *)netguids), netNumaIds((int *)netnumaids) {} + + NetDevices() : nNetDevs(0), netPciPaths(0), netGuids(0), netNumaIds(0) {} + + int getnDevs() { return nNetDevs; } + + ncclResult_t getPciPath(int dev, char** path) { + *path = (char *)malloc(PATH_MAX); + if (dev < nNetDevs) + strcpy(*path, netPciPaths[dev]); + else { + WARN("Could not find real path of %d", dev); + return ncclSystemError; + } + return ncclSuccess; + } + + virtual int getNumaId(char *path) { + int n; + // search for all NICs + for (n = 0; n < nNetDevs; n++) + if (strcmp(path, netPciPaths[n]) == 0) + break; + if (n < nNetDevs) + return netNumaIds[n]; + return -1; + } + + uint64_t getIbGuid(char* path) { + int n; + for (n = 0; n < nNetDevs; n++) + if (strcmp(path, netPciPaths[n]) == 0) + break; + if (n < nNetDevs) + return netGuids[n]; + WARN("Invalid IB path %s", path); + return 0; + } +}; + +class NodeModel { +private: + CpuDevices cpus; + GpuDevices gpus; + NetDevices netdevs; + +public: + int nodeId; + int currRank; + int firstRank; + uint64_t hostHash; // auto-generated + uint64_t 
pidHash; // auto-generated + char description[256]; + + int rankToCudaDev(int rank) { return rank - firstRank; } + + int getnGpus() { return gpus.getnDevs(); } + + int getnNetDevs() { return netdevs.getnDevs(); } + + ncclResult_t getGpuPciPath(char* busId, char** path) { + return gpus.getPciPath(busId, path); + } + + ncclResult_t getNetPciPath(int dev, char** path) { + + return netdevs.getPciPath(dev, path); + } + + uint64_t getGpuBusId(int dev) { + return gpus.getBusId(dev); + } + + int p2pCanConnect(int device1, int device2) { return gpus.p2pCanConnect(device1, device2); } + + hipError_t getLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) { + return gpus.getLinkTypeAndHopCount(device1, device2, linktype, hopcount); + } + + uint64_t getIbGuid(char* path) { + return netdevs.getIbGuid(path); + } + + int shmCanConnect(int device1, int device2) { return 1; } + int netCanConnect(int device1, int device2) { return 1; } + + virtual int getNumaId(char *path) { + int numa = gpus.getNumaId(path); + if (numa != -1) return numa; + numa = netdevs.getNumaId(path); + if (numa != -1) return numa; + WARN("Invalid path %s for getNumaId", path); + return 0; + } + + virtual ncclResult_t getCpuWidths(char* name, int* interCpu, int* cpuPci, int* p2pPci) { + return cpus.getCpuWidths(name, interCpu, cpuPci, p2pPci); + } + + NodeModel(CpuDevices cpu, GpuDevices gpu, NetDevices net, const char *desc) : + cpus(cpu), gpus(gpu), netdevs(net) { + strncpy(description, desc, 256); + } + + NodeModel() {} + + ~NodeModel() {} +}; + +class NetworkModel { +private: + int nNodes; + int nRanks; + NodeModel nodes[NCCL_TOPO_MAX_NODES]; + +public: + void AddNode(NodeModel node) { + nodes[nNodes] = node; + nodes[nNodes].nodeId = nNodes; + nodes[nNodes].firstRank = nRanks; + nodes[nNodes].hostHash = ((uint64_t)rand() << 32) | rand(); + nodes[nNodes].pidHash = ((uint64_t)rand() << 32) | rand(); + nNodes++; + nRanks += node.getnGpus(); + } + + int GetNNodes() { return 
nNodes; } + + int GetNRanks() { return nRanks; } + + NodeModel* GetNode(int rank) { + int node_id; + + if(rank < 0 || rank >= nRanks) + return 0; + + for(node_id = nNodes-1; node_id >= 0; node_id--) + if(rank >= nodes[node_id].firstRank) break; + + if (node_id >= 0) { + nodes[node_id].currRank = rank; + return nodes+node_id; + } + else + return 0; + } + + NetworkModel() : nNodes(0), nRanks(0) {} +}; + + +const static uint64_t busIds_8[] = { 0x1d000, 0x20000, 0x23000, 0x26000, 0x3f000, 0x43000, 0x46000, 0x49000 }; + +const static char* gpuPciPaths_8[] = { + "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:08.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0", + "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:0c.0/0000:1e:00.0/0000:1f:00.0/0000:20:00.0", + "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:10.0/0000:21:00.0/0000:22:00.0/0000:23:00.0", + "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:14.0/0000:24:00.0/0000:25:00.0/0000:26:00.0", + "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:04.0/0000:3d:00.0/0000:3e:00.0/0000:3f:00.0", + "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:0c.0/0000:41:00.0/0000:42:00.0/0000:43:00.0", + "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:10.0/0000:44:00.0/0000:45:00.0/0000:46:00.0", + "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:14.0/0000:47:00.0/0000:48:00.0/0000:49:00.0", +}; + +const static int gpuPciNumaIds_8[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +const static char* netPciPaths_1[] = { + "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0", +}; + +const static char* netPciPaths_1_1[] = { + "/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0", +}; + +const static uint64_t netGuids_1[] = { + 0xb8599f030007053aL, +}; + +const static int netPciNumaIds_1[] = { 0 }; + +const static char* netPciPaths_2[] = { + "/sys/devices/pci0000:17/0000:17:00.0/0000:18:00.0/0000:19:04.0/0000:1a:00.0", + 
"/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:08.0/0000:4c:00.0", +}; + +const static uint64_t netGuids_2[] = { + 0xb8599f030007053aL, + 0x506b4b030027bbf2L, +}; + +const static int netPciNumaIds_2[] = { 0, 0 }; + +const static uint64_t rome_busIds_8[] = { 0x63000, 0x23000, 0x26000, 0x03000, 0xe3000, 0xc3000, 0xc6000, 0xa3000 }; + +const static char* rome_gpuPciPaths_8[] = { + "/sys/devices/pci0000:60/0000:60:03.1/0000:61:00.0/0000:62:00.0/0000:63:00.0", + "/sys/devices/pci0000:20/0000:20:01.1/0000:21:00.0/0000:22:00.0/0000:23:00.0", + "/sys/devices/pci0000:20/0000:20:03.1/0000:24:00.0/0000:25:00.0/0000:26:00.0", + "/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0", + "/sys/devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/0000:e2:00.0/0000:e3:00.0", + "/sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0/0000:c2:00.0/0000:c3:00.0", + "/sys/devices/pci0000:c0/0000:c0:03.1/0000:c4:00.0/0000:c5:00.0/0000:c6:00.0", + "/sys/devices/pci0000:a0/0000:a0:03.1/0000:a1:00.0/0000:a2:00.0/0000:a3:00.0", +}; + +const static int rome_gpuPciNumaIds_8[] = { 0, 0, 0, 0, 4, 4, 4, 4 }; + +const static char* rome_netPciPaths_1[] = { + "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0", +}; + +const static uint64_t rome_netGuids_1[] = { + 0xb8599f030007053aL, +}; + +const static int rom_netPciNumaIds_1[] = { 0 }; + +const static char* rome_netPciPaths_2[] = { + "/sys/devices/pci0000:40/0000:40:01.1/0000:41:00.0", + "/sys/devices/pci0000:80/0000:80:01.1/0000:81:00.0", +}; + +const static uint64_t rome_netGuids_2[] = { + 0xb8599f030007053aL, + 0x506b4b030027bbf2L, +}; + +const static int rom_netPciNumaIds_2[] = { 0, 4 }; + +const int conn_mat_pcie[64] = { + 0 , 40, 40, 40, 40, 40, 40, 40, + 40, 0 , 40, 40, 40, 40, 40, 40, + 40, 40, 0 , 40, 40, 40, 40, 40, + 40, 40, 40, 0 , 40, 40, 40, 40, + 40, 40, 40, 40, 0 , 40, 40, 40, + 40, 40, 40, 40, 40, 0 , 40, 40, + 40, 40, 40, 40, 40, 40, 0 , 40, + 40, 40, 40, 40, 40, 40, 40, 0 , +}; + +const int 
conn_mat_4p2h[64] = { + 0 , 15, 15, 30, 40, 40, 40, 40, + 15, 0 , 30, 15, 40, 40, 40, 40, + 15, 30, 0 , 15, 40, 40, 40, 40, + 30, 15, 15, 0 , 40, 40, 40, 40, + 40, 40, 40, 40, 0 , 15, 15, 30, + 40, 40, 40, 40, 15, 0 , 30, 15, + 40, 40, 40, 40, 15, 30, 0 , 15, + 40, 40, 40, 40, 30, 15, 15, 0 , +}; + +const int conn_mat_8p6l[64] = { + 0 , 15, 15, 15, 15, 30, 15, 15, + 15, 0 , 15, 15, 30, 15, 15, 15, + 15, 15, 0 , 15, 15, 15, 15, 30, + 15, 15, 15, 0 , 15, 15, 30, 15, + 15, 30, 15, 15, 0 , 15, 15, 15, + 30, 15, 15, 15, 15, 0 , 15, 15, + 15, 15, 15, 30, 15, 15, 0 , 15, + 15, 15, 30, 15, 15, 15, 15, 0 , +}; + +const int conn_mat_8p6l_1[64] = { + 0 , 15, 15, 30, 15, 15, 15, 15, + 15, 0 , 30, 15, 15, 15, 15, 15, + 15, 30, 0 , 15, 15, 15, 15, 15, + 30, 15, 15, 0 , 15, 15, 15, 15, + 15, 15, 15, 15, 0 , 15, 15, 30, + 15, 15, 15, 15, 15, 0 , 30, 15, + 15, 15, 15, 15, 15, 30, 0 , 15, + 15, 15, 15, 15, 30, 15, 15, 0 , +}; + +const int conn_mat_rome[64] = { + 0 , 40, 40, 40, 72, 72, 72, 72, + 40, 0 , 40, 40, 72, 72, 72, 72, + 40, 40, 0 , 40, 72, 72, 72, 72, + 40, 40, 40, 0 , 72, 72, 72, 72, + 72, 72, 72, 72, 0 , 40, 40, 40, + 72, 72, 72, 72, 40, 0 , 40, 40, + 72, 72, 72, 72, 40, 40, 0 , 40, + 72, 72, 72, 72, 40, 40, 40, 0 , +}; + +#endif \ No newline at end of file diff --git a/projects/rccl/tools/topo_expl/include/nccl.h b/projects/rccl/tools/topo_expl/include/nccl.h new file mode 100644 index 0000000000..23ffec5283 --- /dev/null +++ b/projects/rccl/tools/topo_expl/include/nccl.h @@ -0,0 +1,267 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_H_ +#define NCCL_H_ + +#include +#include + +#define NCCL_MAJOR 2 +#define NCCL_MINOR 5 +#define NCCL_PATCH 6 +#define NCCL_SUFFIX "" + +#define NCCL_VERSION_CODE 2506 +#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) + +#define RCCL_BFLOAT16 1 + +#ifdef __cplusplus +extern "C" { +#endif + +/* Opaque handle to communicator */ +typedef struct ncclComm* ncclComm_t; + +#define NCCL_UNIQUE_ID_BYTES 128 +typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; + +/* Error type */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclNumResults = 6 } ncclResult_t; + +/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. + * This integer is coded with the MAJOR, MINOR and PATCH level of the + * NCCL library + */ +ncclResult_t ncclGetVersion(int *version); +ncclResult_t pncclGetVersion(int *version); + +/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be + * called once and the Id should be distributed to all ranks in the + * communicator before calling ncclCommInitRank. */ +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); + +/* Creates a new communicator (multi thread/process version). + * rank must be between 0 and nranks-1 and unique within a communicator clique. + * Each rank is associated with a CUDA device, which has to be set before calling + * ncclCommInitRank. + * ncclCommInitRank implicitly synchronizes with other ranks, so it must be + * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. 
*/ +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); + +/* Creates a clique of communicators (single process version). + * This is a convenience function to create a single-process communicator clique. + * Returns an array of ndev newly initialized communicators in comm. + * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). + * If devlist is NULL, the first ndev CUDA devices are used. + * Order of devlist defines user-order of processors within the communicator. */ +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + +/* Frees resources associated with communicator object, but waits for any operations + * that might still be running on the device. */ +ncclResult_t ncclCommDestroy(ncclComm_t comm); +ncclResult_t pncclCommDestroy(ncclComm_t comm); + +/* Frees resources associated with communicator object and aborts any operations + * that might still be running on the device. */ +ncclResult_t ncclCommAbort(ncclComm_t comm); +ncclResult_t pncclCommAbort(ncclComm_t comm); + +/* Returns a human-readable error message. */ +const char* ncclGetErrorString(ncclResult_t result); +const char* pncclGetErrorString(ncclResult_t result); + +/* Checks whether the comm has encountered any asynchronous errors */ +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); + +/* Gets the number of ranks in the communicator clique. */ +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); + +/* Returns the cuda device number associated with the communicator. 
*/ +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); + +/* Returns the user-ordered "rank" associated with the communicator. */ +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); + +/* Reduction operation selector */ +typedef enum { ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclNumOps = 4 } ncclRedOp_t; + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclBfloat16 = 9, + ncclNumTypes = 10 } ncclDataType_t; + +/* + * Collective communication operations + * + * Collective communication operations must be called separately for each + * communicator in a communicator clique. + * + * They return when operations have been enqueued on the CUDA stream. + * + * Since they may perform inter-CPU synchronization, each call has to be done + * from a different thread or process, or needs to use Group Semantics (see + * below). + */ + +/* + * Reduce + * + * Reduces data arrays of length count in sendbuff into recvbuff using op + * operation. + * recvbuff may be NULL on all calls except for root device. + * root is the rank (not the CUDA device) where data will reside after the + * operation is complete. + * + * In-place operation will happen if sendbuff == recvbuff. 
+ */ +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); +ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); + +/* + * (deprecated) Broadcast (in-place) + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * This operation is implicitly in place. + */ +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); +ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); + +/* + * Broadcast + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); +ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); + +/* + * All-Reduce + * + * Reduces data arrays of length count in sendbuff using op operation, and + * leaves identical copies of result on each recvbuff. + * + * In-place operation will happen if sendbuff == recvbuff. 
+ */ +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); +ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); + +/* + * Reduce-Scatter + * + * Reduces data in sendbuff using op operation and leaves reduced result + * scattered over the devices so that recvbuff on rank i will contain the i-th + * block of the result. + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + hipStream_t stream); +ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + hipStream_t stream); + +/* + * All-Gather + * + * Each device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. 
+ */ +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); + +/* + * Group semantics + * + * When managing multiple GPUs from a single thread, and since NCCL collective + * calls may perform inter-CPU synchronization, we need to "group" calls for + * different ranks/devices into a single call. + * + * Grouping NCCL calls as being part of the same collective operation is done + * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + * collective calls until the ncclGroupEnd call, which will wait for all calls + * to be complete. Note that for collective communication, ncclGroupEnd only + * guarantees that the operations are enqueued on the streams, not that + * the operation is effectively done. + * + * Both collective communication and ncclCommInitRank can be used in conjunction + * with ncclGroupStart/ncclGroupEnd. + */ + +/* + * Group Start + * + * Start a group call. All subsequent calls to NCCL may not block due to + * inter-CPU synchronization. + */ +ncclResult_t ncclGroupStart(); +ncclResult_t pncclGroupStart(); + +/* + * Group End + * + * End a group call. Wait for all calls since ncclGroupStart to complete + * before returning. + */ +ncclResult_t ncclGroupEnd(); +ncclResult_t pncclGroupEnd(); + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // end include guard diff --git a/projects/rccl/tools/topo_expl/include/utils.h b/projects/rccl/tools/topo_expl/include/utils.h new file mode 100644 index 0000000000..26e85c26d6 --- /dev/null +++ b/projects/rccl/tools/topo_expl/include/utils.h @@ -0,0 +1,44 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef UTILS_H_ +#define UTILS_H_ + +struct allGather1Data_t { /* argument type of bootstrapAllGather() and initTransportsRank_1(): a rank's peer info plus its communicator pointer */ + struct ncclPeerInfo peerInfo; + struct ncclComm* comm; +}; + +struct allGather3Data_t { /* argument type of initTransportsRank_1() and initTransportsRank_3(); NOTE(review): presumably filled by the former and consumed by the latter - confirm in utils.cpp */ + int cudaCompCap; + int fullCudaCompCap; + int nvlink; + int nChannels; + struct { /* the tree and ring members carry the same four fields */ + int sameChannels; + int speedIntra; + int speedInter; + int nvlink; + } tree; + struct { + int sameChannels; + int speedIntra; + int speedInter; + int nvlink; + } ring; + struct ncclTopoRanks topoRanks; +}; + +ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data); + +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, + struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph); + +ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, + struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph); + +#endif \ No newline at end of file diff --git a/projects/rccl/tools/topo_expl/model.cpp b/projects/rccl/tools/topo_expl/model.cpp new file mode 100644 index 0000000000..9492526dc9 --- /dev/null +++ b/projects/rccl/tools/topo_expl/model.cpp @@ -0,0 +1,216 @@ +/* +Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "nccl.h" +#include "channel.h" +#include "nvmlwrap.h" +#include "bootstrap.h" +#include "transport.h" +#include "group.h" +#include "net.h" +#include "graph.h" +#include "argcheck.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "model.h" + +extern NodeModel *node_model; + +static ncclResult_t dummyNetDevices(int* ndev) { + *ndev = node_model->getnNetDevs(); + return ncclSuccess; +} + +static ncclResult_t dummyNetPciPath(int dev, char** path) { + node_model->getNetPciPath(dev, path); + return ncclSuccess; +} + +ncclNet_t ncclNetDummy = { + "IB", + 0, + dummyNetDevices, + dummyNetPciPath, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 +}; + +ncclNet_t* ncclNet = &ncclNetDummy; + +ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { + return ncclSuccess; +} + +/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ +int busIdToCudaDev(int64_t busId) { + int cudaDev; + + for (cudaDev = 0; cudaDev < node_model->getnGpus(); cudaDev++) { + if (node_model->getGpuBusId(cudaDev) == busId) + break; + } + + if (cudaDev < node_model->getnGpus()) + return cudaDev; + else + WARN("Invalid busId %lx", busId); + return 0; +} + +/* Determine if two peers can communicate with P2P */ +ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + // Rule out different nodes + *ret = 0; + if (info1->hostHash != info2->hostHash) return ncclSuccess; + int cudaDev1 = busIdToCudaDev(info1->busId); + int cudaDev2 = busIdToCudaDev(info2->busId); + *ret = node_model->p2pCanConnect(cudaDev1, cudaDev2); + return ncclSuccess; +} + +/* Send: Create and return connect structures for this peer to connect to me */ +ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, 
struct ncclPeerInfo* peerInfo, + struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + if (myInfo->pidHash == peerInfo->pidHash) { + if (myInfo->cudaDev == peerInfo->cudaDev) { + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + return ncclInternalError; + } else { + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + } + } else { + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + //TRACE_DUMP_IPC(&info.devIpc); + } + return ncclSuccess; +} + +/* Create and return connect structures for this peer to connect to me */ +ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, + struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) { + return ncclSuccess; +} + +struct ncclTransport p2pTransport = { + "P2P", + p2pCanConnect, + { p2pSendSetup, NULL, NULL, NULL }, + { p2pRecvSetup, NULL, NULL, NULL } +}; + +/* Determine if two peers can communicate with SHM */ +ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + // Rule out different nodes + *ret = 0; + if (info1->hostHash != info2->hostHash) return ncclSuccess; + int cudaDev1 = busIdToCudaDev(info1->busId); + int cudaDev2 = busIdToCudaDev(info2->busId); + *ret = node_model->shmCanConnect(cudaDev1, cudaDev2); + return ncclSuccess; +} + +/* Create and return connect structures for this peer to connect to me */ +ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, 
struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + return ncclSuccess; +} + +ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { + return ncclSuccess; +} + +struct ncclTransport shmTransport = { + "SHM", + shmCanConnect, + { shmSendSetup, NULL, NULL, NULL }, + { shmRecvSetup, NULL, NULL, NULL } +}; + +/* Determine if two peers can communicate with NET */ +ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + *ret = node_model->netCanConnect(info1->rank, info2->rank); + return ncclSuccess; +} + +ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + int netDev, useGdr = 0; + + NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &netDev)); + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev, + useGdr ? 
"/GDRDMA" : ""); + return ncclSuccess; +} + +NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB); + +ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { + int netDev, useGdr = 0; + + NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &netDev)); + // Check if we are close enough that it makes sense to enable GDR + int netGdrLevel = ncclParamNetGdrLevel(); + int distance; + NCCLCHECK(ncclTopoNetDistance(topo, myInfo->busId, netDev, &distance)); + if (distance >= netGdrLevel) { + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), myInfo->busId, netDev, distance, netGdrLevel); + } + else + useGdr = 1; + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev, + useGdr ? "/GDRDMA" : ""); + return ncclSuccess; +} + +struct ncclTransport netTransport = { + "NET", + netCanConnect, + { netSendSetup, NULL, NULL, NULL }, + { netRecvSetup, NULL, NULL, NULL } +}; + +struct ncclTransport ncclTransports[NTRANSPORTS] = { + p2pTransport, + shmTransport, + netTransport, +}; diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp new file mode 100644 index 0000000000..2d939bfed3 --- /dev/null +++ b/projects/rccl/tools/topo_expl/topo_expl.cpp @@ -0,0 +1,231 @@ +/* +Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "nccl.h" +#include "channel.h" +#include "nvmlwrap.h" +#include "bootstrap.h" +#include "transport.h" +#include "group.h" +#include "net.h" +#include "graph.h" +#include "argcheck.h" +#include "cpuset.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "model.h" +#include "utils.h" + +NodeModel *node_model; + +char* getCmdOption(char ** begin, char ** end, const std::string & option) { + char ** itr = std::find(begin, end, option); + if (itr != end && ++itr != end) + { + return *itr; + } + return 0; +} + +bool cmdOptionExists(char** begin, char** end, const std::string& option) { + return std::find(begin, end, option) != end; +} + +const char *model_descriptions[] = { + "4 nodes with 8 GPUs PCIe 1 NIC", + "4 nodes with 8 GPUs PCIe 2 NIC", + "2 nodes VEGA20 4P1H", + "4 nodes with 8 VEGA20 GPUs XGMI 4P2H 1 NIC", + "single node gfx908 4P3L", + "single node gfx908 8P6L", + "single node gfx908 8P6L Alt. 
Connection", + "single node 8 GPUs PCIe on Rome", + "4 nodes 8 GPUs PCIe 2 NICs on Rome", + "3 nodes 8 GPUs PCIe + 1 Rome 8 GPUs PCIe + 2 nodes gfx908 4P3L", + NULL, +}; + +int main(int argc,char* argv[]) +{ + struct ncclComm *comm; + + if (!cmdOptionExists(argv, argv + argc, "-m")) { + printf("Usage: ./topo_expl -m model_id\n"); + printf("List of model_id:\n"); + for (int i = 0; model_descriptions[i] != NULL; i++) + printf(" %d: %s\n", i, model_descriptions[i]); + exit(0); + } + + int model_id = 0; + char *mi = getCmdOption(argv, argv + argc, "-m"); + if (mi) + model_id = atol(mi); + + // CPU, GPU and NIC devices on Skylake + CpuDevices skylake("Skylake", 12, 12, 12); + GpuDevices vg20_pcie(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_pcie); + GpuDevices vg20_4p1h(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h); + GpuDevices vg20_4p2h(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_4p2h); + GpuDevices gfx908_4p3l(4, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l); + GpuDevices gfx908_8p6l(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l); + GpuDevices gfx908_8p6l_1(8, busIds_8, gpuPciPaths_8, gpuPciNumaIds_8, conn_mat_8p6l_1); + NetDevices nic_1(1, netPciPaths_1, netGuids_1, netPciNumaIds_1); + NetDevices nic_1_1(1, netPciPaths_1_1, netGuids_1, netPciNumaIds_1); + NetDevices nic_2(2, netPciPaths_2, netGuids_2, netPciNumaIds_2); + + // CPU, GPU and NIC devices on Rome + CpuDevices rome("Rome", 24, 24, 24); + GpuDevices vg20_pcie_rome(8, rome_busIds_8, rome_gpuPciPaths_8, rome_gpuPciNumaIds_8, conn_mat_rome); + NetDevices nic_1_rome(1, rome_netPciPaths_1, rome_netGuids_1, rom_netPciNumaIds_1); + NetDevices nic_2_rome(2, rome_netPciPaths_2, rome_netGuids_2, rom_netPciNumaIds_2); + + // 8 GPUs PCIe 1 NIC + NodeModel model_8pcie_1nic(skylake, vg20_pcie, nic_1, "Skylake 8 GPUs PCIe"); + + // 8 GPUs PCIe 2 NIC + NodeModel model_8pcie_2nic(skylake, vg20_pcie, nic_2, "Skylake 8 GPUs PCIe 2 NIC"); + + // VEGA20 4P1H, use VEGA20 
4P2H model + NodeModel model_vg20_4p1h_1nic(skylake, vg20_4p1h, nic_1, "Skylake VEGA20 4P1H"); + + // VEGA20 GPUs XGMI 4P2H 1 NIC + NodeModel model_vg20_4p2h_1nic(skylake, vg20_4p2h, nic_1_1, "Skylake VEGA20 4P2H"); + + // gfx908 4P3L + NodeModel model_gfx908_4p_1nic(skylake, gfx908_4p3l, nic_1, "Skylake gfx908 4P3L"); + + // gfx908 8P6L + NodeModel model_gfx908_8p_1nic(skylake, gfx908_8p6l, nic_1, "Skylake gfx908 8P6L"); + + // gfx908 8P6L alternative connection + NodeModel model_gfx908_8p_1nic_1(skylake, gfx908_8p6l_1, nic_1, "Skylake gfx908 8P6L Alt. Connection"); + + // 8 GPUs PCIe on Rome + NodeModel model_8pcie_1nic_rome(rome, vg20_pcie_rome, nic_1_rome, "Rome 8 GPUs PCIe"); + + // 8 GPUs PCIe 2 NICs on Rome + NodeModel model_8pcie_2nic_rome(rome, vg20_pcie_rome, nic_2_rome, "Rome 8 GPUs PCIe 2 NICs"); + + NetworkModel network; + + switch(model_id) { + case 0: + for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_1nic); + break; + case 1: + for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic); + break; + case 2: + for (int i = 0; i < 2; i ++) network.AddNode(model_vg20_4p1h_1nic); + break; + case 3: + for (int i = 0; i < 4; i ++) network.AddNode(model_vg20_4p2h_1nic); + break; + case 4: + network.AddNode(model_gfx908_4p_1nic); + break; + case 5: + network.AddNode(model_gfx908_8p_1nic); + break; + case 6: + network.AddNode(model_gfx908_8p_1nic_1); + break; + case 7: + network.AddNode(model_8pcie_1nic_rome); + break; + case 8: + for (int i = 0; i < 4; i ++) network.AddNode(model_8pcie_2nic_rome); + break; + case 9: + for (int i = 0; i < 3; i ++) network.AddNode(model_8pcie_1nic); + network.AddNode(model_8pcie_1nic_rome); + for (int i = 0; i < 2; i ++) network.AddNode(model_gfx908_4p_1nic); + break; + default: + printf("Invalid model_id %d\n", model_id); + exit(0); + } + + printf("Generating topology using %d: %s\n", model_id, model_descriptions[model_id]); + + int nranks = network.GetNRanks(); + int nnodes = network.GetNNodes(); + + 
printf("nnodes = %d, nranks = %d\n", nnodes, nranks); + for (int i = 0; i < nranks; i++) { + node_model = network.GetNode(i); + assert(node_model!=0); + printf("Rank %d: node %d (%s) GPU busId %lx\n", i, node_model->nodeId, + node_model->description, node_model->getGpuBusId(node_model->rankToCudaDev(i))); + } + + NCCLCHECK(ncclCalloc(&comm, nranks)); + + struct allGather1Data_t *allGather1Data; + NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); + + struct allGather3Data_t *allGather3Data; + NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + + for (int i = 0; i < nranks; i++) { + comm[i].rank = i; + comm[i].nRanks = nranks; + node_model = network.GetNode(i); + assert(node_model!=0); + bootstrapAllGather(&comm[i], allGather1Data); + } + + struct ncclTopoGraph treeGraph, ringGraph; + + for (int i = 0; i < nranks; i++) { + node_model = network.GetNode(i); + assert(node_model!=0); + initTransportsRank_1(&comm[i], allGather1Data, allGather3Data, treeGraph, ringGraph); + } + + for (int i = 0; i < nranks; i++) { + node_model = network.GetNode(i); + assert(node_model!=0); + initTransportsRank_3(&comm[i], allGather3Data, treeGraph, ringGraph); + } + + free(allGather3Data); + free(allGather1Data); + + free(comm); + printf("Done generating topology using %d: %s\n", model_id, model_descriptions[model_id]); + + return 0; +} \ No newline at end of file diff --git a/projects/rccl/tools/topo_expl/utils.cpp b/projects/rccl/tools/topo_expl/utils.cpp new file mode 100644 index 0000000000..18d61d0b99 --- /dev/null +++ b/projects/rccl/tools/topo_expl/utils.cpp @@ -0,0 +1,420 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl.h" +#include "channel.h" +#include "nvmlwrap.h" +#include "bootstrap.h" +#include "transport.h" +#include "group.h" +#include "net.h" +#include "graph.h" +#include "argcheck.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "model.h" +#include "utils.h" + +extern NodeModel *node_model; + +NCCL_PARAM(CrossNic, "CROSS_NIC", 2); + +// Get current Compute Capability +int ncclCudaCompCap() { + int ccMajor = 1, ccMinor = 0; + return ccMajor*10+ccMinor; +} + +ncclResult_t int64ToBusId(int64_t id, char* busId) { + sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); + return ncclSuccess; +} + +ncclResult_t busIdToInt64(char* busId, int64_t* id) { + const int size = strlen(busId); + char* hexStr; + NCCLCHECK(ncclCalloc(&hexStr, size)); + int hexOffset = 0; + for (int i=0; i= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f')) { + hexStr[hexOffset++] = busId[i]; + } else break; + } + hexStr[hexOffset] = '\0'; + *id = strtol(hexStr, NULL, 16); + free(hexStr); + return ncclSuccess; +} + +int ncclDebugLevel = -1; + +void ncclDebugInit() { + if (ncclDebugLevel != -1) return; + const char* nccl_debug = getenv("NCCL_DEBUG"); + if (nccl_debug == NULL) { + ncclDebugLevel = NCCL_LOG_NONE; + } else if (strcasecmp(nccl_debug, "VERSION") == 0) { + ncclDebugLevel = NCCL_LOG_VERSION; + } else if (strcasecmp(nccl_debug, "WARN") == 0) { + ncclDebugLevel = NCCL_LOG_WARN; + } else if (strcasecmp(nccl_debug, "INFO") == 0) { + ncclDebugLevel = NCCL_LOG_INFO; + } else if (strcasecmp(nccl_debug, "ABORT") == 0) { + ncclDebugLevel = NCCL_LOG_ABORT; + } else if (strcasecmp(nccl_debug, "TRACE") == 0) { + ncclDebugLevel = NCCL_LOG_TRACE; + } +} + +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, 
const char *filefunc, int line, const char *fmt, ...) { + if (ncclDebugLevel == -1) ncclDebugInit(); + if (level == NCCL_LOG_TRACE && ncclDebugLevel != NCCL_LOG_TRACE) return; + char buffer[1024]; + size_t len; + len = snprintf(buffer, sizeof(buffer), + "[%d:%d] ", node_model->nodeId, node_model->currRank); + va_list args; + va_start(args, fmt); + vsprintf(buffer+len, fmt, args); + va_end(args); + printf("%s\n", buffer); + if (level == NCCL_LOG_WARN) { + fprintf(stderr,"[%d:%d] %s:%d TOPO EXPL ABORT\n", + node_model->nodeId, node_model->currRank, filefunc, line); + abort(); + } +} + +ncclResult_t bootstrapAllGather(struct ncclComm* comm, struct allGather1Data_t * allGather1Data) { + // AllGather1 - begin + allGather1Data[comm->rank].peerInfo.rank = comm->rank; + allGather1Data[comm->rank].peerInfo.cudaDev = node_model->rankToCudaDev(comm->rank); + allGather1Data[comm->rank].peerInfo.gdrSupport = 1; + allGather1Data[comm->rank].peerInfo.hostHash = node_model->hostHash; + allGather1Data[comm->rank].peerInfo.pidHash = node_model->pidHash; + allGather1Data[comm->rank].peerInfo.shmDev = 0x19; + allGather1Data[comm->rank].peerInfo.busId = node_model->getGpuBusId(node_model->rankToCudaDev(comm->rank)); + return ncclSuccess; +} + +ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, + struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph) { + // We use 3 AllGathers + // 1. { peerInfo, comm } + // 2. ConnectTransport[nranks], ConnectValue[nranks] + // 3. 
{ nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] } + + int rank = comm->rank; + int nranks = comm->nRanks; + //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); + //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); + //NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); + + // AllGather1 - begin + //struct allGather1Data_t *allGather1Data; + //NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); + //allGather1Data[rank].comm = comm; + struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo; + //NCCLCHECK(fillInfo(comm, myInfo, commHash)); + //NCCLCHECK(bootstrapAllGather(comm, allGather1Data)); + + NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); + for (int i = 0; i < nranks; i++) { + memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); + if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId); + return ncclInvalidUsage; + } + } + // AllGather1 data is used again below + // AllGather1 - end + + // Topo detection / System graph creation + NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); + // Compute paths between GPUs and NICs + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + // Remove inaccessible GPUs and unused NICs + NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); + // Recompute paths after trimming + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + // Compute max speed to accelerate search + NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo)); + // Print final topology + NCCLCHECK(ncclTopoPrint(comm->topo)); + + // Get rings and trees + //struct ncclTopoGraph treeGraph; + treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; + treeGraph.crossNic = ncclParamCrossNic(); + // We communicate only half the data between node with trees on 2 nodes. 
+ NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); + //struct ncclTopoGraph ringGraph; + ringGraph.pattern = NCCL_TOPO_PATTERN_RING; + ringGraph.crossNic = ncclParamCrossNic(); + NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph)); + + // AllGather3 - begin + allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); + allGather3Data[rank].nvlink = treeGraph.nvlink; + allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; + allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; + allGather3Data[rank].tree.speedInter = treeGraph.speedInter; + allGather3Data[rank].tree.nvlink = treeGraph.nvlink; + allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; + allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; + allGather3Data[rank].ring.speedInter = ringGraph.speedInter; + allGather3Data[rank].ring.nvlink = ringGraph.nvlink; + + NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); + //INFO(NCCL_GRAPH, "%d: nvlink %d nChannels %d tree.sameChannels %d tree.speedIntra %d tree.speedInter %d tree.nvlink %d ring.sameChannels %d ring.speedIntra %d ring.speedInter %d ring.nvlink %d", + // rank, allGather3Data[rank].nvlink, allGather3Data[rank].nChannels, allGather3Data[rank].tree.sameChannels, allGather3Data[rank].tree.speedIntra, allGather3Data[rank].tree.speedInter, allGather3Data[rank].tree.nvlink, + // allGather3Data[rank].ring.sameChannels, allGather3Data[rank].ring.speedIntra, allGather3Data[rank].ring.speedInter, allGather3Data[rank].ring.nvlink); + //INFO(NCCL_GRAPH, " ringRecv %d ringSend %d ringPrev %d ringNext %d treeUpRecv %d treeUpSend %d treeDnRecv %d treeDnSend %d", + // allGather3Data[rank].topoRanks.ringRecv[0], allGather3Data[rank].topoRanks.ringSend[0], 
allGather3Data[rank].topoRanks.ringPrev[0], allGather3Data[rank].topoRanks.ringNext[0], + // allGather3Data[rank].topoRanks.treeUpRecv[0], allGather3Data[rank].topoRanks.treeUpSend[0], allGather3Data[rank].topoRanks.treeDnRecv[0], allGather3Data[rank].topoRanks.treeDnSend[0]); + return ncclSuccess; +} + +template +static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { + for (int t=0; tsend : &transport->recv; + int ret = 0; + NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo)); + if (ret) { + //cpu_set_t affinitySave; + //sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + //int cudaDev; + //CUDACHECK(hipGetDevice(&cudaDev)); + //setCpuAffinity(cudaDev); + connector->transportComm = transportComm; + NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId)); + //sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + return ncclSuccess; + } + } + WARN("No transport found !"); + return ncclInternalError; +} + +static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { + TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); + uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ + struct ncclConnect connect; + struct ncclConnector* conn; + for (int i=0; ipeers[peer].recv; + if (conn->connected) { ++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + //NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; ipeers[peer].send; + if (conn->connected) { ++nSkippedSend; continue; } + 
memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + //NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; ipeers[peer].send; + if (conn->connected) {++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); + //NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + //NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + for (int i=0; ipeers[peer].recv; + if (conn->connected) {++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + //CCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + //NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); + return ncclSuccess; +} + +ncclResult_t initChannel(struct ncclComm* comm, int channelid) { + struct ncclChannel* channel = comm->channels+channelid; + channel->id = channelid; + + // Setup intermediate buffering + //channel->buffSize = ncclParamBuffsize(); + + // Ring index to user rank table. + //NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); + NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); + + // Communication structures with peers. + //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks)); + NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks)); + for (size_t i=0; inRanks; ++i) { + channel->peers[i].send.comm = comm; + channel->peers[i].recv.comm = comm; + } + + // Per-channel operation list. 
+ //NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS)); + return ncclSuccess; +} + +static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + NCCLCHECK(initChannel(comm, channelId)); + + struct ncclRing* ring = &comm->channels[channelId].ring; + // Reorganize ranks to start with rank. + int shift; + for (shift = 0; shiftuserRanks[i] = ringRanks[(i+shift)%nranks]; + } + return ncclSuccess; +} + +ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, + struct ncclTopoGraph& ringGraph) { + int rank = comm->rank; + int nranks = comm->nRanks; + //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); + + // Determine nNodes, firstRanks, ... + int* nodesFirstRank; + NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks)); + for (int i=0; inNodes; n++) { + if (nodesFirstRank[n] == firstRank) node = n; + } + if (node == -1) { + node = comm->nNodes++; + nodesFirstRank[node] = firstRank; + } + if (i == comm->rank) comm->node = node; + } + + // Determine the minimum CUDA Compute capability of all GPUs + int myCompCap = allGather3Data[rank].cudaCompCap; + int minCompCap = myCompCap, maxCompCap = myCompCap; + for (int i = 0; i < nranks; i++) { + minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap); + maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap); + } + + comm->nvlink = 1; + for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink; + + int nChannelsOrig = comm->nChannels; + struct ncclTopoRanks** allTopoRanks; + NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); + for (int i=0; inChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); + treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); + 
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); + treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); + treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink); + ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); + ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); + ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); + ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink); + } + + if (comm->nChannels < nChannelsOrig) { + // We started duplicating channels during Preset(), so we need to move the + // duplicated channels since we have removed some. + for (int i=0; inChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); + } + + int *rings; + NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); + + char line[1024]; + sprintf(line, "nodesFirstRank: "); + int offset = strlen(line); + for (int i=0; inNodes; i++) { + sprintf(line+offset, "%d ", nodesFirstRank[i]); + offset = strlen(line); + } + INFO(NCCL_INIT, "%s", line); + + NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings)); + + free(allTopoRanks); + free(nodesFirstRank); + + // AllGather3 - end + + TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); + + line[0]='\0'; + for (int c=0; cnChannels; c++) { + struct ncclTree* treeUp = &comm->channels[c].treeUp; + struct ncclTree* treeDn = &comm->channels[c].treeDn; + snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d", + c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up, + treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Trees%s", line); + + free(rings); + + // Done with AllGather1 data + 
//free(allGather1Data); + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); + + // Connect with prev/next for each ring + struct ncclConnect *connect; + NCCLCHECK(ncclCalloc(&connect, 2)); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks)); + if (comm->nRanks == 1) continue; + NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); + NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up)); + NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down)); + } + TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); + free(connect); + + return ncclSuccess; +}