graph: fix for MI300X 64 GPU case (#1308)
PR #1290 introduced a failure in the 64-GPU case on MI300X; this PR fixes that failure.
This commit is contained in:
@@ -626,7 +626,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
|
||||
// Only use full MAXCHANNELS for gfx94x
|
||||
int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
|
||||
((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20)
|
||||
((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
|
||||
? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
|
||||
|
||||
if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
|
||||
|
||||
+1
-1
@@ -94,7 +94,7 @@ struct ncclTopoLink {
|
||||
float bw;
|
||||
struct ncclTopoNode* remNode;
|
||||
};
|
||||
#define NCCL_TOPO_MAX_LINKS 64 //Changed the value from 32 to 64 for CPX mode
|
||||
#define NCCL_TOPO_MAX_LINKS 128 //Changed the value from 32 to 128 for CPX mode
|
||||
|
||||
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
|
||||
|
||||
|
||||
+2
-2
@@ -17,8 +17,8 @@
|
||||
// A few constraints to make the implementation easy
|
||||
#define MAX_STR_LEN 255
|
||||
#define MAX_ATTR_COUNT 16
|
||||
#define MAX_SUBS 64 //Changed the value from 32 to 64 for CPX mode
|
||||
#define MAX_NODES 4096 //Changed the value from 1024 to 4096 for CPX mode
|
||||
#define MAX_SUBS 512 //Changed the value from 32 to 512 for CPX mode
|
||||
#define MAX_NODES 8192 //Changed the value from 1024 to 8192 for CPX mode
|
||||
|
||||
#define NODE_TYPE_NONE 0
|
||||
#define NODE_TYPE_OPEN 1
|
||||
|
||||
Reference in New Issue
Block a user