Changing channel stride for MI300X multinode (#1196)

* Shuffling MI300X multi-node channels
* Updating tree channel logic

[ROCm/rccl commit: 0948eecbba]
This commit is contained in:
gilbertlee-amd
2024-06-03 10:00:55 -06:00
committed by GitHub
parent 072c378bc0
commit a009e43f3c
2 changed files with 180 additions and 8 deletions
+5 -2
View File
@@ -281,7 +281,10 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
}
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int nChannels = (comm->nChannels > MAXCHANNELS/2) ? comm->nChannels/2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node;
const int channelLimit = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? MAXCHANNELS/2 : 16;
const int nChannels = (comm->nChannels > channelLimit) ? comm->nChannels / 2 : comm->nChannels;
const int nNodes = comm->nNodes, node = comm->node;
// Compute tree depth. Not an exact value but a good approximation in most
// cases
@@ -290,7 +293,7 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int*
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
int* ttp, *ttc0, *ttc1;
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
if (comm->nChannels <= IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? (MAXCHANNELS/2) : (MAXCHANNELS/4)) {
if (nChannels == comm->nChannels) {
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
+175 -6
View File
@@ -603,14 +603,70 @@ static struct rcclRomeModel rome_model_81 = {
"N5 5 4 7 1 3 2 6 0 N0|"
"N6 6 3 1 4 0 7 5 2 N2|"
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N2 2 5 0 3 6 1 7 4 N4|"
"N3 3 7 0 4 2 1 6 5 N5|"
"N4 4 6 2 7 3 0 5 1 N1|"
"N5 5 4 7 1 3 2 6 0 N0|"
"N6 6 3 1 4 0 7 5 2 N2|"
"N7 7 2 0 6 4 1 5 3 N3",
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 1 7 4 N4|"
"N3 3 7 0 4 2 1 6 5 N5|"
"N4 4 6 2 7 3 0 5 1 N1|"
"N5 5 4 7 1 3 2 6 0 N0|"
"N6 6 3 1 4 0 7 5 2 N2|"
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N3 3 7 0 4 2 1 6 5 N5|"
"N4 4 6 2 7 3 0 5 1 N1|"
"N5 5 4 7 1 3 2 6 0 N0|"
"N6 6 3 1 4 0 7 5 2 N2|"
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N2 2 5 0 3 6 1 7 4 N4|"
"N4 4 6 2 7 3 0 5 1 N1|"
"N5 5 4 7 1 3 2 6 0 N0|"
"N6 6 3 1 4 0 7 5 2 N2|"
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N2 2 5 0 3 6 1 7 4 N4|"
"N3 3 7 0 4 2 1 6 5 N5|"
"N5 5 4 7 1 3 2 6 0 N0|"
"N6 6 3 1 4 0 7 5 2 N2|"
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N2 2 5 0 3 6 1 7 4 N4|"
"N3 3 7 0 4 2 1 6 5 N5|"
"N4 4 6 2 7 3 0 5 1 N1|"
"N6 6 3 1 4 0 7 5 2 N2|"
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N2 2 5 0 3 6 1 7 4 N4|"
"N3 3 7 0 4 2 1 6 5 N5|"
"N4 4 6 2 7 3 0 5 1 N1|"
"N5 5 4 7 1 3 2 6 0 N0|"
"N7 7 2 0 6 4 1 5 3 N3|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N2 2 5 0 3 6 1 7 4 N4|"
"N3 3 7 0 4 2 1 6 5 N5|"
"N4 4 6 2 7 3 0 5 1 N1|"
"N5 5 4 7 1 3 2 6 0 N0|"
"N6 6 3 1 4 0 7 5 2 N2|",
.ringTail2 = "N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N4 4 6 2 1 7 0 5 3 N3|"
@@ -619,14 +675,71 @@ static struct rcclRomeModel rome_model_81 = {
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 4 7 1 N1|"
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N4 4 6 2 1 7 0 5 3 N3|"
"N5 5 2 7 3 1 6 0 4 N4|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 4 7 1 N1|"
"N3 3 7 2 6 1 5 4 0 N0",
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N4 4 6 2 1 7 0 5 3 N3|"
"N5 5 2 7 3 1 6 0 4 N4|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 4 7 1 N1|"
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N5 5 2 7 3 1 6 0 4 N4|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 4 7 1 N1|"
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N4 4 6 2 1 7 0 5 3 N3|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 4 7 1 N1|"
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N4 4 6 2 1 7 0 5 3 N3|"
"N5 5 2 7 3 1 6 0 4 N4|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 4 7 1 N1|"
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N4 4 6 2 1 7 0 5 3 N3|"
"N5 5 2 7 3 1 6 0 4 N4|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N2 2 5 0 3 6 4 7 1 N1|"
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N4 4 6 2 1 7 0 5 3 N3|"
"N5 5 2 7 3 1 6 0 4 N4|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N3 3 7 2 6 1 5 4 0 N0|"
"N7 7 4 1 3 2 0 6 5 N5|"
"N6 6 3 0 7 5 1 4 2 N2|"
"N4 4 6 2 1 7 0 5 3 N3|"
"N5 5 2 7 3 1 6 0 4 N4|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N2 2 5 0 3 6 4 7 1 N1|",
.ringTail1 = "N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N3 3 6 4 0 5 1 7 2 N2|"
@@ -635,14 +748,70 @@ static struct rcclRomeModel rome_model_81 = {
"N7 7 3 2 6 0 4 1 5 N5|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N3 3 6 4 0 5 1 7 2 N2|"
"N4 4 7 0 6 5 2 1 3 N3|"
"N6 6 2 0 7 5 3 1 4 N4|"
"N7 7 3 2 6 0 4 1 5 N5|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7",
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N3 3 6 4 0 5 1 7 2 N2|"
"N4 4 7 0 6 5 2 1 3 N3|"
"N6 6 2 0 7 5 3 1 4 N4|"
"N7 7 3 2 6 0 4 1 5 N5|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N4 4 7 0 6 5 2 1 3 N3|"
"N6 6 2 0 7 5 3 1 4 N4|"
"N7 7 3 2 6 0 4 1 5 N5|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N3 3 6 4 0 5 1 7 2 N2|"
"N6 6 2 0 7 5 3 1 4 N4|"
"N7 7 3 2 6 0 4 1 5 N5|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N3 3 6 4 0 5 1 7 2 N2|"
"N4 4 7 0 6 5 2 1 3 N3|"
"N7 7 3 2 6 0 4 1 5 N5|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N3 3 6 4 0 5 1 7 2 N2|"
"N4 4 7 0 6 5 2 1 3 N3|"
"N6 6 2 0 7 5 3 1 4 N4|"
"N1 1 0 2 4 3 5 7 6 N6|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N3 3 6 4 0 5 1 7 2 N2|"
"N4 4 7 0 6 5 2 1 3 N3|"
"N6 6 2 0 7 5 3 1 4 N4|"
"N7 7 3 2 6 0 4 1 5 N5|"
"N0 0 1 2 3 4 5 6 7 N7|"
"N5 5 4 2 7 1 6 3 0 N0|"
"N2 2 5 0 3 7 4 6 1 N1|"
"N3 3 6 4 0 5 1 7 2 N2|"
"N4 4 7 0 6 5 2 1 3 N3|"
"N6 6 2 0 7 5 3 1 4 N4|"
"N7 7 3 2 6 0 4 1 5 N5|"
"N1 1 0 2 4 3 5 7 6 N6|",
.options = "noCpuCheck=1,tuning=5",
};