Add support for another Rome model

Этот коммит содержится в:
Wenkai Du
2021-02-18 00:44:15 +00:00
родитель c985358e11
Коммит 95f178324c
3 изменённых файла: 56 добавлений и 29 удалений
+26 -2
Просмотреть файл
@@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#define MAX_ROME_GPUS 8
#define MAX_ROME_NICS 2
#define MAX_ROME_GPUS 16
#define MAX_ROME_NICS 8
struct rcclRomeModel {
int nGpus;
@@ -235,6 +235,28 @@ static struct rcclRomeModel rome_model_46 = {
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
};
/*
 * Rome topology model 48: 8 GPUs, 4 CPU nodes, no NICs, nLinks = 3.
 * GPUs are listed in pairs; gpuNuma assigns each pair to nodes 0..3.
 */
static struct rcclRomeModel rome_model_48 = {
  .nGpus = 8,
  .nCpus = 4,
  .nNics = 0,
  .nLinks = 3,
  .gpuIds = {
    0x4a000, 0x50000,
    0xa000, 0xf000,
    0xcb000, 0xd1000,
    0x8a000, 0x90000,
  },
  .nicIds = { },
  .gpuNuma = {
    0, 0,
    1, 1,
    2, 2,
    3, 3,
  },
  .nicNuma = { },
  /*
   * 64 entries, written here as 8 rows of 8. The non-zero entries form two
   * 4x4 blocks with a zero diagonal (entries 0-3 and 4-7).
   * NOTE(review): presumably row i / column j is the link between GPU i and
   * GPU j -- confirm against the code that consumes connMatrix.
   */
  .connMatrix = {
    0, 1, 1, 1, 0, 0, 0, 0,
    1, 0, 1, 1, 0, 0, 0, 0,
    1, 1, 0, 1, 0, 0, 0, 0,
    1, 1, 1, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 1, 1, 1,
    0, 0, 0, 0, 1, 0, 1, 1,
    0, 0, 0, 0, 1, 1, 0, 1,
    0, 0, 0, 0, 1, 1, 1, 0,
  },
  /* Two digits per CPU node: GPU count, then NIC count. */
  .pattern = "20202020",
  /* Four '|'-separated orderings of the 8 GPU indices. */
  .ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
};
/*
 * Rome topology model 49: 8 GPUs, 4 CPU nodes, 4 NICs, nLinks = 3.
 * GPUs are listed in pairs and NICs singly; gpuNuma / nicNuma assign
 * one GPU pair and one NIC to each of nodes 0..3.
 */
static struct rcclRomeModel rome_model_49 = {
  .nGpus = 8,
  .nCpus = 4,
  .nNics = 4,
  .nLinks = 3,
  .gpuIds = {
    0x4a000, 0x50000,
    0xa000, 0xf000,
    0xcb000, 0xd1000,
    0x8a000, 0x90000,
  },
  .nicIds = {
    0x45000,
    0x13000,
    0xc6000,
    0x85000,
  },
  .gpuNuma = {
    0, 0,
    1, 1,
    2, 2,
    3, 3,
  },
  .nicNuma = {
    0,
    1,
    2,
    3,
  },
  /*
   * 64 entries, written here as 8 rows of 8. The non-zero entries form two
   * 4x4 blocks with a zero diagonal (entries 0-3 and 4-7).
   * NOTE(review): presumably row i / column j is the link between GPU i and
   * GPU j -- confirm against the code that consumes connMatrix.
   */
  .connMatrix = {
    0, 1, 1, 1, 0, 0, 0, 0,
    1, 0, 1, 1, 0, 0, 0, 0,
    1, 1, 0, 1, 0, 0, 0, 0,
    1, 1, 1, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 1, 1, 1,
    0, 0, 0, 0, 1, 0, 1, 1,
    0, 0, 0, 0, 1, 1, 0, 1,
    0, 0, 0, 0, 1, 1, 1, 0,
  },
  /* Two digits per CPU node: GPU count, then NIC count. */
  .pattern = "21212121",
  /*
   * Two '|'-separated orderings of the 8 GPU indices.
   * NOTE(review): the N0 / N3 tokens presumably name NIC indices at the
   * ring ends -- confirm against the ringBase parser.
   */
  .ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0",
};
static struct rcclRomeModel romeTopoModels[] = {
rome_model_22,
rome_model_25,
@@ -254,4 +276,6 @@ static struct rcclRomeModel romeTopoModels[] = {
rome_model_44,
rome_model_45,
rome_model_46,
rome_model_48,
rome_model_49,
};
+29 -26
Просмотреть файл
@@ -864,17 +864,6 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclT
return ncclSuccess;
}
static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) {
*g = 0; *n = 0;
int i;
if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false;
for (int j = 0; j < nnet; j++)
if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++;
for (int j = 0; j < system->nodes[GPU].count; j++)
if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++;
return true;
}
static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) {
*index = -1;
for (int i=0; i<system->nodes[GPU].count; i++) {
@@ -893,12 +882,18 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
romeTopo->nNics = 0;
romeTopo->nLinks = 0;
for (int i = 0; i < romeTopo->nGpus; i ++) {
int gpu, n;
int gpu, n, m, distance;
NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu));
romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
for (n = 0; n < romeTopo->nCpus; n++)
if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count == 2) break;
if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id;
m = 0;
distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
for (n = 1; n < romeTopo->nCpus; n++) {
if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
m = n;
}
}
if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
if (node->paths[GPU] == NULL) continue;
@@ -936,24 +931,32 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
}
}
for (int i = 0; i < romeTopo->nNics; i ++) {
int net, n, m, distance;
NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
m = 0;
distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
for (n = 0; n < romeTopo->nCpus; n++)
if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
m = n;
}
if (m < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
else return ncclSuccess;
}
// number of GPUs and NICs on each numa node is used as first screening pattern
for (int i = 0; i < romeTopo->nCpus; i++) {
int g, n;
getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map);
int g = 0, n = 0;
for (int j = 0; j < romeTopo->nGpus; j++)
if (romeTopo->gpuNuma[j] == i) g++;
for (int j = 0; j < romeTopo->nNics; j++)
if (romeTopo->nicNuma[j] == i) n++;
pattern[i*2] = '0' + g;
pattern[i*2+1] = '0' + n;
}
pattern[romeTopo->nCpus*2] = 0;
for (int i = 0; i < romeTopo->nNics; i ++) {
int net, n;
NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
for (n = 0; n < romeTopo->nCpus; n++)
if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break;
if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id;
else return ncclSuccess;
}
const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
if (romeModelFile) {
INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
+1 -1
Просмотреть файл
@@ -163,7 +163,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
for (int i = 0; i < nranks; i++) {
memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
return ncclInvalidUsage;
}
}