Update rome models (#922)

[ROCm/rccl commit: 4278a9918b]
This commit is contained in:
Wenkai Du
2023-10-18 17:28:01 -07:00
کامیت شده توسط GitHub
والد 49e52e7269
کامیت edeea499b5
2 فایلهای تغییر یافته به همراه 21 افزوده شده و 3 حذف شده
@@ -591,7 +591,20 @@ static struct rcclRomeModel rome_model_82 = {
.gdrLevel = { },
.pattern = "4040",
.ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
.options = "noCpuCheck=1,mscclEnabled=1",
.options = "noCpuCheck=1,mscclEnabled=1,disableNumaMatching=1",
};
// Topology model 83: a table entry describing a system with 8 GPUs, 2 CPUs
// and 8 NICs (see the count fields on the first line of the initializer).
// NOTE(review): the exact meaning of each field is defined by
// struct rcclRomeModel, which is not visible here; hedged notes below should
// be confirmed against that declaration.
static struct rcclRomeModel rome_model_83 = {
.nGpus = 8, .nCpus = 2, .nNics = 8, .nLinks = 7,
// One id per GPU and one per NIC (8 each) — presumably PCI bus ids; TODO confirm.
.gpuIds = { 0xc000, 0x22000, 0x38000, 0x5c000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
.nicIds = { 0x7000, 0x1d000, 0x33000, 0x57000, 0x9a000, 0xaa000, 0xba000, 0xda000, },
// Entries 0-3 are 0 and entries 4-7 are 1 for both GPUs and NICs —
// presumably the NUMA node each device is attached to; TODO confirm.
.gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
.nicNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
// 64 entries. Read as an 8x8 row-major matrix, the diagonal is 0 and every
// other entry is 1 (7 non-zero entries per row, the same value as nLinks) —
// presumably a fully connected GPU-to-GPU mesh; TODO confirm layout.
.connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, },
// 64 entries. Read as 8 rows of 8: rows 0-3 are four PATH_PHB followed by
// four PATH_SYS, rows 4-7 are four PATH_SYS followed by four PATH_PHB, i.e.
// PATH_PHB when row and column index fall in the same half and PATH_SYS
// otherwise. NOTE(review): presumably the NIC/GPU path level used for
// GPU Direct RDMA decisions; which axis is NIC vs. GPU is not shown here.
.gdrLevel = { PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, },
// Pattern string for this model; parseRome4P2H compares a model's pattern
// with strcmp against the detected one unless NUMA matching is ignored.
.pattern = "4444",
// Eight '|'-separated rings. Each ring lists every index 0-7 exactly once
// and is bracketed by N<k> tokens at both ends — presumably the NICs used to
// enter and leave the ring; TODO confirm against the ring parser.
.ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 6 1 7 4 N4|N3 3 7 0 4 2 1 6 5 N5|N4 4 6 2 7 3 0 5 1 N1|N5 5 4 7 1 3 2 6 0 N0|N6 6 3 1 4 0 7 5 2 N2|N7 7 2 0 6 4 1 5 3 N3",
// Comma-separated key=value flags; the parsing code is not visible here.
.options = "noCpuCheck=1,disableNumaMatching=1",
};
static struct rcclRomeModel romeTopoModels[] = {
@@ -636,7 +649,8 @@ static struct rcclRomeModel romeTopoModels[] = {
rome_model_79,
rome_model_80,
rome_model_81,
rome_model_82
rome_model_82,
rome_model_83,
};
/* Parse user defined rings. Format is like :
@@ -1281,7 +1295,7 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
if (!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern)) continue;
// permute GPU IDs
for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus;
if (!permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time, match_nbio, ignore_numa)) continue;
if (!permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time, ignore_cpu ? false : match_nbio, ignore_numa)) continue;
if (nnets > 1) {
// permute NET IDs
for (int j = 0; j < nnets; j++) n[j] = (j+2)%nnets;
@@ -376,6 +376,10 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
gpu->gpu.gcn = "gfx908";
} else if (strcmp(gpu->gpu.gcn, "910") == 0) {
gpu->gpu.gcn = "gfx90a";
} else if (strcmp(gpu->gpu.gcn, "940") == 0) {
gpu->gpu.gcn = "gfx940";
} else if (strcmp(gpu->gpu.gcn, "941") == 0) {
gpu->gpu.gcn = "gfx941";
}
rcclHipDeviceArch_t arch;
NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));