graph: fix for MI300X 64 GPU case (#1308)

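PR #1290 introduced a failure in the 64-GPU case on MI300X. This PR
fixes that failure by also treating a CU count of 38 as a valid gfx94x
channel count and by raising NCCL_TOPO_MAX_LINKS, MAX_SUBS, and MAX_NODES.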
This commit is contained in:
Nusrat Islam
2024-08-26 18:37:58 -05:00
committed by GitHub
parent 607e34dd99
commit 833435be18
3 changed files with 4 additions and 4 deletions
+1 -1
View File
@@ -626,7 +626,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Only use full MAXCHANNELS for gfx94x
int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20)
((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
+1 -1
View File
@@ -94,7 +94,7 @@ struct ncclTopoLink {
float bw;
struct ncclTopoNode* remNode;
};
#define NCCL_TOPO_MAX_LINKS 64 //Changed the value from 32 to 64 for CPX mode
#define NCCL_TOPO_MAX_LINKS 128 //Changed the value from 32 to 128 for CPX mode
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
+2 -2
View File
@@ -17,8 +17,8 @@
// A few constraints to make the implementation easy
#define MAX_STR_LEN 255
#define MAX_ATTR_COUNT 16
#define MAX_SUBS 64 //Changed the value from 32 to 64 for CPX mode
#define MAX_NODES 4096 //Changed the value from 1024 to 4096 for CPX mode
#define MAX_SUBS 512 //Changed the value from 32 to 512 for CPX mode
#define MAX_NODES 8192 //Changed the value from 1024 to 8192 for CPX mode
#define NODE_TYPE_NONE 0
#define NODE_TYPE_OPEN 1