2020-02-03 22:06:44 +00:00
|
|
|
/*
|
|
|
|
|
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
|
|
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
|
|
|
in the Software without restriction, including without limitation the rights
|
|
|
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
|
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
|
|
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
|
|
|
all copies or substantial portions of the Software.
|
|
|
|
|
|
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
|
THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "nccl.h"
|
|
|
|
|
#include "channel.h"
|
|
|
|
|
#include "nvmlwrap.h"
|
|
|
|
|
#include "bootstrap.h"
|
|
|
|
|
#include "transport.h"
|
|
|
|
|
#include "group.h"
|
|
|
|
|
#include "net.h"
|
|
|
|
|
#include "graph.h"
|
|
|
|
|
#include "argcheck.h"
|
|
|
|
|
#include <sched.h>
|
|
|
|
|
#include <fcntl.h>
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
#include <hip/hip_runtime.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <errno.h>
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
#include <dlfcn.h>
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
|
#include <sys/stat.h>
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
#include <cstdio>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <cstring>
|
|
|
|
|
#include "model.h"
|
|
|
|
|
#include "utils.h"
|
2020-02-26 14:13:15 -08:00
|
|
|
#include "topo.h"
|
2022-04-26 15:40:07 -07:00
|
|
|
#include "graph.h"
|
2025-04-23 15:44:56 -04:00
|
|
|
#include "rccl_common.h"
|
2020-02-03 22:06:44 +00:00
|
|
|
|
|
|
|
|
// Node model of the rank currently being processed. main() re-points this at
// network.GetNode(i) before each per-rank step.
// NOTE(review): presumably read by code outside this file — confirm.
NodeModel *node_model;
|
2022-09-09 01:20:52 +00:00
|
|
|
// Defined in another translation unit; main() copies this pointer into every
// communicator's ncclNet field.
extern ncclNet_t* ncclNet;
|
|
|
|
|
|
2025-02-07 08:44:04 -07:00
|
|
|
// Always reports INT64_MAX for the "WorkArgsBytes" parameter.
// NOTE(review): looks like a local stand-in for a getter the library would
// normally provide — confirm.
int64_t ncclParamWorkArgsBytes() {
  return INT64_MAX;
}
|
2020-02-03 22:06:44 +00:00
|
|
|
|
|
|
|
|
/**
 * Returns the argument that follows the first occurrence of `option` in the
 * range [begin, end).
 *
 * @param begin  first element of the argument array
 * @param end    one past the last element of the argument array
 * @param option flag to search for (compared by string value)
 * @return pointer to the argument after the flag, or a null pointer when the
 *         flag is absent or is the last element of the range
 */
char* getCmdOption(char** begin, char** end, const std::string& option) {
  for (char** cur = begin; cur != end; ++cur) {
    if (option == *cur) {
      // Only the first match counts; its value is the next slot, if any.
      char** value = cur + 1;
      return (value != end) ? *value : nullptr;
    }
  }
  return nullptr;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reports whether `option` appears anywhere in the range [begin, end).
 *
 * @param begin  first element of the argument array
 * @param end    one past the last element of the argument array
 * @param option flag to search for (compared by string value)
 * @return true when at least one element equals `option`
 */
bool cmdOptionExists(char** begin, char** end, const std::string& option) {
  for (char** cur = begin; cur != end; ++cur) {
    if (option == *cur) {
      return true;
    }
  }
  return false;
}
|
|
|
|
|
|
2020-07-22 16:05:53 +00:00
|
|
|
// One selectable topology model: the file it is loaded from and the label
// shown for it in the usage listing.
struct NodeModelDesc {
  const char* filename;     // handed to the NodeModel constructor in main()
  const char* description;  // human-readable label printed by main()
};
|
|
|
|
|
|
|
|
|
|
// Table of topology models selectable with "-m <index>". The array index is
// the model_id; main() prints this table as the usage listing and loads
// model_descs[model_id].filename. Entries are grouped by GPU architecture.
// NOTE(review): the description strings appear to encode GPU count/arch,
// XGMI/PCIe layout, NIC count and CPU type — confirm against the XML files.
NodeModelDesc model_descs[] = {
  // GFX 906
  {"topo_4p1h.xml", " 4gfx906 1H2XGMI 1NIC 1Intel A"},
  {"topo_4p1h_1.xml", " 4gfx906 1H2XGMI 2NIC 2Intel A"},
  {"topo_8p_rome.xml", " 8gfx906 2H2XGMI 1NIC 2AMD A"},
  {"topo_8p_rome_n2.xml", " 8gfx906 2H2XGMI 1NIC 4AMD A"},
  {"topo_8p_rome_n4.xml", " 8gfx906 2H2XGMI 1NIC 7AMD A"},
  {"topo_4p2h.xml", " 8gfx906 2H2XGMI 1NIC 1Intel A"},
  {"topo_4p2h_1.xml", " 8gfx906 2H2XGMI 1NIC 1Intel B"},
  {"topo_4p2h_2nic.xml", " 8gfx906 2H2XGMI 2NIC 1Intel A"},
  {"topo_8p_rome_n2_1.xml", " 8gfx906 2H2XGMI 2NIC 4AMD A"},
  {"topo_8p_rome_n2_2.xml", " 8gfx906 2H2XGMI 2NIC 4AMD B"},
  {"topo_8p_ts1.xml", " 8gfx906 2H2XGMI 2NIC 4AMD C"},
  {"topo_8p_ts1_1.xml", " 8gfx906 2H2XGMI 2NIC 4AMD D"},
  {"topo_8p_ts1_n4.xml", " 8gfx906 2H2XGMI 2NIC 8AMD A"},
  {"topo_8p_ts1_n4_1.xml", " 8gfx906 2H2XGMI 2NIC 8AMD B"},
  {"topo_8p_ts1_n4_2.xml", " 8gfx906 2H2XGMI 3NIC 8AMD C"},
  {"topo_8p_pcie.xml", " 8gfx906 PCIe 1NIC 1Intel A"},
  {"topo_8p_pcie_1.xml", " 8gfx906 PCIe 1NIC 1Intel B"},
  {"topo_8p_pcie_2nic.xml", " 8gfx906 PCIe 2NIC 1Intel A"},
  {"topo_8p_rome_pcie.xml", " 8gfx906 PCIe 2NIC 2AMD2 A"},
  // GFX 908
  {"topo_4p3l.xml", " 4gfx908 1H3XGMI 2NIC 1Intel A"},
  {"topo_8p6l.xml", " 8gfx908 1H6XGMI 1NIC 2AMD A"},
  {"topo_8p6l_1nic.xml", " 8gfx908 1H6XGMI 1NIC 2AMD B"},
  {"topo_8p6l_2nic.xml", " 8gfx908 1H6XGMI 2NIC 2AMD A"},
  {"topo_8p6l_3nic.xml", " 8gfx908 1H6XGMI 3NIC 2AMD A"},
  {"topo_8p6l_4nic.xml", " 8gfx908 1H6XGMI 4NIC 2AMD A"},
  {"topo_8p6l_5nic.xml", " 8gfx908 1H6XGMI 5NIC 2AMD A"},
  {"topo_8p6l_6nic.xml", " 8gfx908 1H6XGMI 6NIC 2AMD A"},
  {"topo_4p3l_ia.xml", " 8gfx908 2H3XGMI 1NIC 1Intel A"},
  {"topo_4p3l_2h.xml", " 8gfx908 2H3XGMI 1NIC 4AMD A"},
  {"topo_4p3l_n2.xml", " 8gfx908 2H3XGMI 1NIC 4AMD B"},
  {"topo_4p3l_n2_1.xml", " 8gfx908 2H3XGMI 1NIC 4AMD C"},
  {"topo_collnet_n1.xml", " 8gfx908 2H3XGMI 1NIC 4AMD D"},
  {"topo_8p_rome_vm1.xml", " 8gfx908 2H3XGMI 1NIC 4AMD E"},
  {"topo_4p3l_n4.xml", " 8gfx908 2H3XGMI 1NIC 7AMD A"},
  {"topo_8p_rome_n4_1.xml", " 8gfx908 2H3XGMI 1NIC 7AMD B"},
  {"topo_8p_rome_4nics.xml", " 8gfx908 2H3XGMI 4NIC 4AMD A"},
  {"topo_collnet_n4.xml", " 8gfx908 2H3XGMI 4NIC 4AMD B"},
  {"topo_8p_rome_4n_1.xml", " 8gfx908 2H3XGMI 4NIC 4AMD C"},
  {"topo_8p_rome_4n_2.xml", " 8gfx908 2H3XGMI 4NIC 4AMD D"},
  {"topo_8p_4nics.xml", " 8gfx908 2H3XGMI 4NIC 4AMD E"},
  {"topo_4p4h.xml", "16gfx908 2H3XGMI 16NIC 1AMD A"},
  // GFX 910
  {"topo_3p_pcie.xml", " 3gfx910 PCIe 1NIC 2AMD A"},
  {"topo_3p_pcie_1.xml", " 3gfx910 PCIe 1NIC 2AMD B"},
  {"topo_8p_90a.xml", " 8gfx910 2H3XGMI 1NIC 1AMD A"},
  {"topo_8p_90a_1.xml", " 8gfx910 2H3XGMI 1NIC 3AMD A"},
  {"topo_8p1h_2.xml", " 8gfx910 2H3XGMI 2NIC 4AMD A"},
  {"topo_8p1h.xml", " 8gfx910 2H3XGMI 4NIC 2AMD A"},
  {"topo_8p1h_n1.xml", " 8gfx910 2H3XGMI 4NIC 2AMD B"},
  {"topo_8p1h_1.xml", " 8gfx910 2H3XGMI 4NIC 2AMD C"},
  {"topo_8p1h_3.xml", " 8gfx910 2H3XGMI 4NIC 4AMD A"},
  {"topo_8p1h_4.xml", " 8gfx910 2H3XGMI 8NIC 2AMD A"},
  {"topo_8p1h_5.xml", " 8gfx910 2H3XGMI 8NIC 2AMD B"},
  {"topo_16p1h.xml", "16gfx910 2H3XGMI 8NIC 4AMD A"},
  {"topo_16p1h_vm.xml", "16gfx910 2H3XGMI 8NIC 4AMD B"},
  // GFX 942
  {"topo_4p_942.xml", " 4gfx942 1H3XGMI 4NIC 4AMD2 A"},
  {"topo_8p_942.xml", " 8gfx942 1H7XGMI 8NIC 2Intel A"},
  {"topo_8p_942vm.xml", " 8gfx942 1H7XGMI 8NIC 2Intel B"},
  {"topo_16p_gio-1s-1rp-cascade.xml", "16gfx942 2H7XGMI 1NIC 2AMD A"},
  {"topo_16p_gio-3s-1rp-split-flat.xml", "16gfx942 2H7XGMI 1NIC 2AMD B"},
  // GFX 950
  {"topo_8p_950.xml", " 8gfx950 1H7XGMI 8NIC 2AMD A"},
};
|
|
|
|
|
|
2023-06-21 20:54:24 -07:00
|
|
|
// Declares the MaxCTAs parameter (key "MAX_CTAS", default MAXCHANNELS);
// main() reads it through ncclParamMaxCTAs().
// NOTE(review): presumably overridable from the environment — confirm against
// the NCCL_PARAM macro definition.
NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);
|
|
|
|
|
// Declares the MinCTAs parameter (key "MIN_CTAS", default 1); main() reads it
// through ncclParamMinCTAs().
// NOTE(review): presumably overridable from the environment — confirm against
// the NCCL_PARAM macro definition.
NCCL_PARAM(MinCTAs, "MIN_CTAS", 1);
|
|
|
|
|
|
2020-02-03 22:06:44 +00:00
|
|
|
int main(int argc,char* argv[])
|
|
|
|
|
{
|
|
|
|
|
struct ncclComm *comm;
|
2020-07-22 16:05:53 +00:00
|
|
|
const int num_models = sizeof(model_descs) / sizeof(*model_descs);
|
2023-06-21 20:54:24 -07:00
|
|
|
int minCTAsEnv;
|
|
|
|
|
int maxCTAsEnv;
|
2020-02-03 22:06:44 +00:00
|
|
|
|
|
|
|
|
if (!cmdOptionExists(argv, argv + argc, "-m")) {
|
2024-04-04 15:11:47 -06:00
|
|
|
printf("Usage: ./topo_expl -m model_id [-n numNodes=1]\n");
|
2020-02-03 22:06:44 +00:00
|
|
|
printf("List of model_id:\n");
|
2020-07-22 16:05:53 +00:00
|
|
|
for (int i = 0; i < num_models; i++)
|
2025-02-20 15:18:29 -07:00
|
|
|
printf(" %2d: %24s [%s]\n", i, model_descs[i].description, model_descs[i].filename);
|
2020-02-03 22:06:44 +00:00
|
|
|
exit(0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int model_id = 0;
|
|
|
|
|
char *mi = getCmdOption(argv, argv + argc, "-m");
|
|
|
|
|
if (mi)
|
|
|
|
|
model_id = atol(mi);
|
|
|
|
|
|
2020-07-22 16:05:53 +00:00
|
|
|
if (model_id >= num_models) {
|
|
|
|
|
printf("Invalid model_id %d\n", model_id);
|
|
|
|
|
exit(0);
|
|
|
|
|
}
|
|
|
|
|
|
2020-02-03 22:06:44 +00:00
|
|
|
NetworkModel network;
|
2020-03-03 11:42:40 -08:00
|
|
|
NodeModel* node;
|
2020-02-03 22:06:44 +00:00
|
|
|
|
2021-03-25 20:59:32 -07:00
|
|
|
initCollNet();
|
|
|
|
|
|
2020-07-22 16:05:53 +00:00
|
|
|
NodeModelDesc *desc = &model_descs[model_id];
|
2025-02-07 08:44:04 -07:00
|
|
|
int numNodes = 1;
|
2024-04-04 15:11:47 -06:00
|
|
|
if (cmdOptionExists(argv, argv + argc, "-n")) {
|
|
|
|
|
char *numNodesStr = getCmdOption(argv, argv + argc, "-n");
|
|
|
|
|
if (numNodesStr)
|
|
|
|
|
numNodes = atol(numNodesStr);
|
|
|
|
|
}
|
|
|
|
|
for (int i=0; i < numNodes; i++) {
|
2020-07-22 16:05:53 +00:00
|
|
|
node = new NodeModel(desc->filename);
|
2020-03-03 11:42:40 -08:00
|
|
|
network.AddNode(node);
|
2020-02-03 22:06:44 +00:00
|
|
|
}
|
|
|
|
|
|
2020-07-22 16:05:53 +00:00
|
|
|
printf("Generating topology using %d: %s\n", model_id, desc->description);
|
2020-02-03 22:06:44 +00:00
|
|
|
|
|
|
|
|
int nranks = network.GetNRanks();
|
|
|
|
|
int nnodes = network.GetNNodes();
|
|
|
|
|
|
|
|
|
|
printf("nnodes = %d, nranks = %d\n", nnodes, nranks);
|
|
|
|
|
for (int i = 0; i < nranks; i++) {
|
|
|
|
|
node_model = network.GetNode(i);
|
|
|
|
|
assert(node_model!=0);
|
2020-03-03 11:42:40 -08:00
|
|
|
printf("Rank %d: node %d cudaDev %d GPU busId %lx\n", i, node_model->nodeId,
|
|
|
|
|
node_model->rankToCudaDev(i), node_model->getGpuBusId(i));
|
2020-02-03 22:06:44 +00:00
|
|
|
}
|
|
|
|
|
|
2023-06-21 20:54:24 -07:00
|
|
|
minCTAsEnv = ncclParamMinCTAs();
|
|
|
|
|
maxCTAsEnv = ncclParamMaxCTAs();
|
|
|
|
|
|
2020-02-03 22:06:44 +00:00
|
|
|
NCCLCHECK(ncclCalloc(&comm, nranks));
|
|
|
|
|
|
2022-04-18 11:14:51 -07:00
|
|
|
struct ncclPeerInfo *peerInfo;
|
|
|
|
|
NCCLCHECK(ncclCalloc(&peerInfo, nranks+1)); // Extra rank to represent CollNet root
|
2020-02-03 22:06:44 +00:00
|
|
|
|
2023-06-21 20:54:24 -07:00
|
|
|
struct allGatherInfo* allGather3Data;
|
2020-02-03 22:06:44 +00:00
|
|
|
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
|
|
|
|
|
|
2023-06-21 20:54:24 -07:00
|
|
|
struct ncclTopoGraph *treeGraph, *ringGraph, *collNetGraph, *nvlsGraph;
|
2022-04-18 11:14:51 -07:00
|
|
|
NCCLCHECK(ncclCalloc(&treeGraph, nranks));
|
|
|
|
|
NCCLCHECK(ncclCalloc(&ringGraph, nranks));
|
|
|
|
|
NCCLCHECK(ncclCalloc(&collNetGraph, nranks));
|
2023-06-21 20:54:24 -07:00
|
|
|
NCCLCHECK(ncclCalloc(&nvlsGraph, nranks));
|
2022-04-18 11:14:51 -07:00
|
|
|
|
2020-02-03 22:06:44 +00:00
|
|
|
for (int i = 0; i < nranks; i++) {
|
|
|
|
|
comm[i].rank = i;
|
|
|
|
|
comm[i].nRanks = nranks;
|
2021-07-27 08:30:08 -07:00
|
|
|
NCCLCHECK(ncclCalloc(&comm[i].connectSend, NCCL_MAX_CONNS*comm->nRanks));
|
|
|
|
|
NCCLCHECK(ncclCalloc(&comm[i].connectRecv, NCCL_MAX_CONNS*comm->nRanks));
|
2020-02-03 22:06:44 +00:00
|
|
|
node_model = network.GetNode(i);
|
|
|
|
|
assert(node_model!=0);
|
2022-04-18 11:14:51 -07:00
|
|
|
comm[i].busId = node_model->getGpuBusId(i);
|
2020-03-03 11:42:40 -08:00
|
|
|
comm[i].topo = node_model->getSystem(i);
|
2022-04-18 11:14:51 -07:00
|
|
|
comm[i].peerInfo = peerInfo;
|
2022-09-09 01:20:52 +00:00
|
|
|
comm[i].ncclNet = ncclNet;
|
2023-06-21 20:54:24 -07:00
|
|
|
comm[i].config.maxCTAs = maxCTAsEnv;
|
|
|
|
|
comm[i].config.minCTAs = minCTAsEnv;
|
|
|
|
|
if (comm[i].topParentRanks == NULL) {
|
|
|
|
|
NCCLCHECK(ncclCalloc(&comm[i].topParentRanks, comm->nRanks));
|
|
|
|
|
for (int j = 0; j < comm->nRanks; ++j)
|
|
|
|
|
comm[i].topParentRanks[j] = j;
|
|
|
|
|
}
|
|
|
|
|
struct ncclSharedResources* sharedRes = NULL;
|
|
|
|
|
NCCLCHECK(ncclCalloc(&sharedRes, 1));
|
|
|
|
|
/* most of attributes are assigned later in initTransportsRank(). */
|
|
|
|
|
sharedRes->owner = &comm[i];
|
|
|
|
|
sharedRes->tpNRanks = comm[i].nRanks;
|
|
|
|
|
NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm[i].nRanks));
|
|
|
|
|
comm[i].sharedRes = sharedRes;
|
|
|
|
|
sharedRes->refCount = 1;
|
|
|
|
|
ncclMemoryStackConstruct(&comm[i].memPermanent);
|
|
|
|
|
// Mark channels as non initialized.
|
2021-04-30 16:57:36 -07:00
|
|
|
for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
|
2022-04-18 11:14:51 -07:00
|
|
|
NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
|
2020-02-03 22:06:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < nranks; i++) {
|
|
|
|
|
node_model = network.GetNode(i);
|
|
|
|
|
assert(node_model!=0);
|
2023-06-21 20:54:24 -07:00
|
|
|
initTransportsRank_1(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
|
2020-02-03 22:06:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < nranks; i++) {
|
|
|
|
|
node_model = network.GetNode(i);
|
|
|
|
|
assert(node_model!=0);
|
2023-06-21 20:54:24 -07:00
|
|
|
initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
|
2025-06-16 15:18:35 +03:00
|
|
|
CUDACHECK(hipDeviceGetAttribute(&comm[i].WarpSize, hipDeviceAttributeWarpSize, comm[i].cudaDev));
|
2020-02-03 22:06:44 +00:00
|
|
|
}
|
2022-04-26 15:40:07 -07:00
|
|
|
for (uint64_t len = 8; len <= 4294967296L; len *= 2) {
|
|
|
|
|
struct ncclInfo info;
|
|
|
|
|
float minTime = 3600000000.0;
|
|
|
|
|
info.comm = &comm[0];
|
2025-06-16 15:18:35 +03:00
|
|
|
|
2022-04-26 15:40:07 -07:00
|
|
|
info.coll = ncclFuncAllReduce;
|
|
|
|
|
// Find algorithm / protocol.
|
2025-02-07 08:44:04 -07:00
|
|
|
int algorithm = -1;
|
|
|
|
|
int protocol = -1;
|
2022-04-26 15:40:07 -07:00
|
|
|
int nAlgos = NCCL_NUM_ALGORITHMS;
|
|
|
|
|
for (int a=0; a<nAlgos; a++) {
|
|
|
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
|
|
|
float time;
|
2025-05-05 15:26:29 -04:00
|
|
|
NCCLCHECK(ncclTopoGetAlgoTime(info.comm, info.coll, a, p, len, 1, &time));
|
2022-04-26 15:40:07 -07:00
|
|
|
if (time >= 0 && time < minTime) {
|
2025-02-07 08:44:04 -07:00
|
|
|
algorithm = a;
|
|
|
|
|
protocol = p;
|
2022-04-26 15:40:07 -07:00
|
|
|
minTime = time;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-02-07 08:44:04 -07:00
|
|
|
if (algorithm == -1 || protocol == -1) {
|
2022-04-26 15:40:07 -07:00
|
|
|
WARN("Error : no algorithm/protocol available");
|
|
|
|
|
return ncclInternalError;
|
|
|
|
|
}
|
2025-02-07 08:44:04 -07:00
|
|
|
INFO(NCCL_TUNING, "%10ld %s %s time %f", len, ncclAlgoStr[algorithm], ncclProtoStr[protocol], minTime);
|
2022-04-26 15:40:07 -07:00
|
|
|
}
|
|
|
|
|
|
2025-04-23 15:44:56 -04:00
|
|
|
// Arrays to store function types for ncclFuncAllReduce, ReduceScatter, and AllGather
|
|
|
|
|
std::vector<ncclFunc_t> ncclFuncTypes = {
|
|
|
|
|
ncclFuncAllReduce,
|
|
|
|
|
ncclFuncReduceScatter,
|
2025-09-10 14:25:23 -05:00
|
|
|
ncclFuncAllGather,
|
|
|
|
|
ncclFuncReduce,
|
|
|
|
|
ncclFuncBroadcast
|
2025-04-23 15:44:56 -04:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
std::cout << "Running fp32 production choices for algorithm/protocol/maxChannels" << std::endl;
|
|
|
|
|
// RCCL tuning results
|
|
|
|
|
printf("| %-15s | %-15s | %-15s | %-10s | %-10s | %-12s |\n", "Max Size(B)", "Count", "Collective", "Algorithm", "Protocol", "Max Channels");
|
|
|
|
|
printf("|-----------------|-----------------|-----------------|------------|------------|--------------|\n");
|
|
|
|
|
for(int i = 0; i < ncclFuncTypes.size(); ++i) {
|
|
|
|
|
for (uint64_t count = 8; count <= 1073741824L; count *= 2) { // Up to 1 gigabyte
|
|
|
|
|
int algo, proto, nChannels;
|
|
|
|
|
NCCLCHECK(rcclGetAlgoInfo(&comm[0], ncclFuncTypes[i], count, ncclFloat32 , 0, 0, 1, &algo, &proto, &nChannels));
|
|
|
|
|
uint64_t maxCount;
|
|
|
|
|
NCCLCHECK(rcclFuncMaxSendRecvCount(ncclFuncTypes[i], comm[0].nRanks, count, maxCount));
|
|
|
|
|
printf("| %-15ld | %-15ld | %-15s | %-10s | %-10s | %-12d |\n",
|
|
|
|
|
maxCount * sizeof(float),
|
|
|
|
|
count,
|
|
|
|
|
ncclFuncStr[ncclFuncTypes[i]],
|
|
|
|
|
ncclAlgoStr[algo],
|
|
|
|
|
ncclProtoStr[proto],
|
|
|
|
|
nChannels);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2020-12-01 11:33:47 -05:00
|
|
|
for (int i = 0; i < nranks; i++) {
|
|
|
|
|
free(comm[i].connectSend);
|
|
|
|
|
free(comm[i].connectRecv);
|
|
|
|
|
}
|
|
|
|
|
|
2020-07-01 15:11:02 -07:00
|
|
|
free(treeGraph);
|
|
|
|
|
free(ringGraph);
|
|
|
|
|
free(collNetGraph);
|
2020-02-03 22:06:44 +00:00
|
|
|
free(allGather3Data);
|
2022-04-18 11:14:51 -07:00
|
|
|
free(peerInfo);
|
2020-02-03 22:06:44 +00:00
|
|
|
|
|
|
|
|
free(comm);
|
2020-07-22 16:05:53 +00:00
|
|
|
printf("Done generating topology using %d: %s\n", model_id, desc->description);
|
2020-02-03 22:06:44 +00:00
|
|
|
|
|
|
|
|
return 0;
|
2020-07-22 16:05:53 +00:00
|
|
|
}
|