Consistent channel shuffling for MI300X multi-node (#1255)
* Revert "[GRAPH] Use channel shuffling only for IB systems (#1228)" This reverts commit 5be3b713ef. * Revert "Revert "Changing channel stride for MI300X multinode (#1196)" (#1224)" This reverts commit ad31d93f3d.
Bu işleme şunda yer alıyor:
işlemeyi yapan:
GitHub
ebeveyn
67e867271f
işleme
a1ef217b32
@@ -624,29 +624,11 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
|
||||
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
|
||||
|
||||
// Define channels for non-gfx94 GPU architectures
|
||||
int maxChannels = 2*CHANNEL_LIMIT;
|
||||
int multiNodeNchannels = maxChannels;
|
||||
|
||||
// Define channels for gfx94 GPU architectures
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
|
||||
// Only use full MAXCHANNELS for gfx94x
|
||||
maxChannels = MAXCHANNELS;
|
||||
|
||||
// Define channels=64 for gfx94 multi-node systems
|
||||
multiNodeNchannels = 64;
|
||||
|
||||
// Check if NCCL_IB_GID_INDEX=3 -- needed for RoCE systems
|
||||
const char* ncclIbGidIndex = ncclGetEnv("NCCL_IB_GID_INDEX");
|
||||
int gid_index = 0;
|
||||
if (ncclIbGidIndex) gid_index = atoi(ncclIbGidIndex);
|
||||
|
||||
// Limit channels=48 for RoCE gfx94 multi-node systems
|
||||
multiNodeNchannels = gid_index == 3 ? 48 : multiNodeNchannels;
|
||||
}
|
||||
// Only use full MAXCHANNELS for gfx94x
|
||||
int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? MAXCHANNELS : 2*CHANNEL_LIMIT;
|
||||
|
||||
if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
|
||||
maxChannels = std::min(multiNodeNchannels, maxChannels);
|
||||
maxChannels = std::min(64, maxChannels);
|
||||
}
|
||||
|
||||
// Duplicate ringPrev/ringNext for ncclBuildRing
|
||||
@@ -692,7 +674,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
|
||||
int minNchannels = ncclMinNchannels();
|
||||
if (comm->nNodes > 1) {
|
||||
minNchannels = std::min(multiNodeNchannels, minNchannels);
|
||||
minNchannels = std::min(64, minNchannels);
|
||||
}
|
||||
if (comm->nRanks < 8 && 64 < minNchannels) {
|
||||
minNchannels = 2;
|
||||
|
||||
@@ -28,7 +28,6 @@ THE SOFTWARE.
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include "rome_models.h"
|
||||
#include "param.h"
|
||||
|
||||
struct rcclRomeModel {
|
||||
int nGpus;
|
||||
@@ -813,7 +812,7 @@ static struct rcclRomeModel rome_model_81 = {
|
||||
"N7 7 3 2 6 0 4 1 5 N5|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|",
|
||||
|
||||
.options = "noCpuCheck=1,tuning=5,disableNumaMatching=1,isRoCE=0",
|
||||
.options = "noCpuCheck=1,tuning=5,disableNumaMatching=1",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_84 = {
|
||||
@@ -842,114 +841,6 @@ static struct rcclRomeModel rome_model_85 = {
|
||||
.options = "tuning=2",
|
||||
};
|
||||
|
||||
// rome_model_86: topology model entry with nGpus=8, nCpus=2, nNics=8, nLinks=7.
// Its options string sets isRoCE=1; parseRome4P2H reads that value with
// checkOptionValue and skips the model when it differs from the isRoCE flag
// derived from NCCL_IB_GID_INDEX.
static struct rcclRomeModel rome_model_86 = {
|
||||
.nGpus = 8, .nCpus = 2, .nNics = 8, .nLinks = 7,
|
||||
.gpuIds = { 0xc000, 0x22000, 0x38000, 0x5c000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
|
||||
.nicIds = { 0x7000, 0x1d000, 0x33000, 0x57000, 0x9a000, 0xaa000, 0xba000, 0xda000, },
|
||||
// First four GPUs / NICs are tagged 0, last four are tagged 1.
.gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
|
||||
.nicNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
|
||||
// 8x8 matrix: 0 on the diagonal, 1 everywhere else.
.connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 0, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 0, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 0, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 0, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 0, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 0, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 0, },
|
||||
// 8x8 table: PATH_PXB on the diagonal, PATH_PHB inside each group of four,
// PATH_SYS between the two groups.
// NOTE(review): presumably indexed NIC x GPU (or GPU x NIC) — confirm against the struct declaration.
.gdrLevel = {PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
|
||||
PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
|
||||
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
|
||||
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
|
||||
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB,
|
||||
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB,
|
||||
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB,
|
||||
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, },
|
||||
.pattern = "4444",
|
||||
// ringBase: the same eight '|'-terminated ring strings, written out three times
// (the very last entry has no trailing '|').
.ringBase = "N0 0 1 2 3 4 5 6 7 N7|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N2 2 5 0 3 6 1 7 4 N4|"
|
||||
"N3 3 7 0 4 2 1 6 5 N5|"
|
||||
"N4 4 6 2 7 3 0 5 1 N1|"
|
||||
"N5 5 4 7 1 3 2 6 0 N0|"
|
||||
"N6 6 3 1 4 0 7 5 2 N2|"
|
||||
"N7 7 2 0 6 4 1 5 3 N3|"
|
||||
|
||||
"N0 0 1 2 3 4 5 6 7 N7|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N2 2 5 0 3 6 1 7 4 N4|"
|
||||
"N3 3 7 0 4 2 1 6 5 N5|"
|
||||
"N4 4 6 2 7 3 0 5 1 N1|"
|
||||
"N5 5 4 7 1 3 2 6 0 N0|"
|
||||
"N6 6 3 1 4 0 7 5 2 N2|"
|
||||
"N7 7 2 0 6 4 1 5 3 N3|"
|
||||
|
||||
"N0 0 1 2 3 4 5 6 7 N7|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N2 2 5 0 3 6 1 7 4 N4|"
|
||||
"N3 3 7 0 4 2 1 6 5 N5|"
|
||||
"N4 4 6 2 7 3 0 5 1 N1|"
|
||||
"N5 5 4 7 1 3 2 6 0 N0|"
|
||||
"N6 6 3 1 4 0 7 5 2 N2|"
|
||||
"N7 7 2 0 6 4 1 5 3 N3",
|
||||
|
||||
// ringTail2: a different set of eight ring strings, again written out three times.
.ringTail2 = "N7 7 4 1 3 2 0 6 5 N5|"
|
||||
"N6 6 3 0 7 5 1 4 2 N2|"
|
||||
"N4 4 6 2 1 7 0 5 3 N3|"
|
||||
"N5 5 2 7 3 1 6 0 4 N4|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N0 0 1 2 3 4 5 6 7 N7|"
|
||||
"N2 2 5 0 3 6 4 7 1 N1|"
|
||||
"N3 3 7 2 6 1 5 4 0 N0|"
|
||||
|
||||
"N7 7 4 1 3 2 0 6 5 N5|"
|
||||
"N6 6 3 0 7 5 1 4 2 N2|"
|
||||
"N4 4 6 2 1 7 0 5 3 N3|"
|
||||
"N5 5 2 7 3 1 6 0 4 N4|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N0 0 1 2 3 4 5 6 7 N7|"
|
||||
"N2 2 5 0 3 6 4 7 1 N1|"
|
||||
"N3 3 7 2 6 1 5 4 0 N0|"
|
||||
|
||||
"N7 7 4 1 3 2 0 6 5 N5|"
|
||||
"N6 6 3 0 7 5 1 4 2 N2|"
|
||||
"N4 4 6 2 1 7 0 5 3 N3|"
|
||||
"N5 5 2 7 3 1 6 0 4 N4|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N0 0 1 2 3 4 5 6 7 N7|"
|
||||
"N2 2 5 0 3 6 4 7 1 N1|"
|
||||
"N3 3 7 2 6 1 5 4 0 N0",
|
||||
|
||||
|
||||
// ringTail1: a third set of eight ring strings, also written out three times.
.ringTail1 = "N5 5 4 2 7 1 6 3 0 N0|"
|
||||
"N2 2 5 0 3 7 4 6 1 N1|"
|
||||
"N3 3 6 4 0 5 1 7 2 N2|"
|
||||
"N4 4 7 0 6 5 2 1 3 N3|"
|
||||
"N6 6 2 0 7 5 3 1 4 N4|"
|
||||
"N7 7 3 2 6 0 4 1 5 N5|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N0 0 1 2 3 4 5 6 7 N7|"
|
||||
|
||||
"N5 5 4 2 7 1 6 3 0 N0|"
|
||||
"N2 2 5 0 3 7 4 6 1 N1|"
|
||||
"N3 3 6 4 0 5 1 7 2 N2|"
|
||||
"N4 4 7 0 6 5 2 1 3 N3|"
|
||||
"N6 6 2 0 7 5 3 1 4 N4|"
|
||||
"N7 7 3 2 6 0 4 1 5 N5|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N0 0 1 2 3 4 5 6 7 N7|"
|
||||
|
||||
"N5 5 4 2 7 1 6 3 0 N0|"
|
||||
"N2 2 5 0 3 7 4 6 1 N1|"
|
||||
"N3 3 6 4 0 5 1 7 2 N2|"
|
||||
"N4 4 7 0 6 5 2 1 3 N3|"
|
||||
"N6 6 2 0 7 5 3 1 4 N4|"
|
||||
"N7 7 3 2 6 0 4 1 5 N5|"
|
||||
"N1 1 0 2 4 3 5 7 6 N6|"
|
||||
"N0 0 1 2 3 4 5 6 7 N7",
|
||||
|
||||
// isRoCE=1 is the key/value pair that parseRome4P2H matches against the runtime RoCE flag.
.options = "noCpuCheck=1,tuning=5,disableNumaMatching=1,isRoCE=1",
|
||||
};
|
||||
|
||||
|
||||
static struct rcclRomeModel romeTopoModels[] = {
|
||||
rome_model_22, /* 0 */
|
||||
@@ -995,7 +886,6 @@ static struct rcclRomeModel romeTopoModels[] = {
|
||||
rome_model_81, /* 40 */
|
||||
rome_model_84, /* 41 */
|
||||
rome_model_85, /* 42 */
|
||||
rome_model_86, /* 43 */
|
||||
};
|
||||
|
||||
/* Parse user defined rings. Format is like :
|
||||
@@ -1283,27 +1173,6 @@ static bool checkOption(const char *options, const char *name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static int checkOptionValue(const char *options, const char *name) {
|
||||
if (strcmp(options, "")) {
|
||||
char *str_temp = (char *)malloc(strlen(options) + 1);
|
||||
strcpy(str_temp, options);
|
||||
char* tokens[MAX_OPT_TOKENS];
|
||||
int numTokens = 0;
|
||||
char* state;
|
||||
tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
|
||||
numTokens++;
|
||||
while (tokens[numTokens-1] != NULL && numTokens < MAX_OPT_TOKENS)
|
||||
tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
|
||||
for (int i = 0; i < numTokens/2; i++) {
|
||||
if (strcmp(tokens[i*2], name) == 0) {
|
||||
return atol(tokens[i*2+1]);
|
||||
}
|
||||
}
|
||||
free(str_temp);
|
||||
}
|
||||
return -2;
|
||||
}
|
||||
|
||||
ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
|
||||
int id[8], dist[8];
|
||||
@@ -1667,24 +1536,12 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
|
||||
}
|
||||
if (i < romeTopo.nGpus) match_nbio = false;
|
||||
|
||||
// check if NCCL_IB_GID_INDEX=3 -- needed for RoCE systems
|
||||
const char* ncclIbGidIndex = ncclGetEnv("NCCL_IB_GID_INDEX");
|
||||
int gid_index = 0;
|
||||
if (ncclIbGidIndex) gid_index = atoi(ncclIbGidIndex);
|
||||
int isRoCE = gid_index == 3 ? 1 : 0;
|
||||
|
||||
for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
|
||||
bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
|
||||
if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME))
|
||||
continue;
|
||||
|
||||
bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
|
||||
if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue;
|
||||
|
||||
// check if "isRoCE=1" is defined in model struct options
|
||||
int optionsIsRoCE = checkOptionValue(romeTopoModels[i].options, "isRoCE");
|
||||
if (optionsIsRoCE != -2 && optionsIsRoCE != isRoCE) continue;
|
||||
|
||||
if (romeTopo.nGpus != romeTopoModels[i].nGpus ||
|
||||
romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
|
||||
if (!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern)) continue;
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle