Merge pull request #267 from wenkaidu/p2p
Limit P2P channels on Rome
[ROCm/rccl commit: 45a8f09e97]
This commit is contained in:
@@ -90,7 +90,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
|
||||
|
||||
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
// Only launch blocks where we have work to do.
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
|
||||
if (comm->channels[c].collCount) params->gridDim.x = c+1;
|
||||
}
|
||||
|
||||
|
||||
@@ -510,9 +510,16 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
|
||||
// Round to next pow2 nChannelsPerPeer and nChannels
|
||||
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
|
||||
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
|
||||
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) {
|
||||
// Adjust P2P channels on Rome
|
||||
comm->p2pnChannelsPerPeer = 2;
|
||||
comm->p2pnChannels = 2;
|
||||
}
|
||||
else {
|
||||
// Round to next pow2 nChannelsPerPeer and nChannels
|
||||
comm->p2pnChannelsPerPeer = nextPow2(minChannels);
|
||||
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
|
||||
}
|
||||
|
||||
// Init channels that weren't used so far
|
||||
for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
|
||||
|
||||
@@ -892,14 +892,18 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
|
||||
*str = 0;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int ncpus = system->nodes[CPU].count;
|
||||
// 8 GPUs and 4 numa nodes only
|
||||
if (ngpus != 8 || (ncpus != 4 && ncpus != 8))
|
||||
// 8 GPUs only
|
||||
if (ngpus != 8)
|
||||
return ncclSuccess;
|
||||
// only valid on Rome
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
|
||||
return ncclSuccess;
|
||||
system->type = RCCL_TOPO_4P2H_ROME;
|
||||
// 4 or 8 numa nodes only
|
||||
if (ncpus != 4 && ncpus != 8)
|
||||
return ncclSuccess;
|
||||
// number of GPUs and NICs on each numa node is used as first screening pattern
|
||||
char pattern[256];
|
||||
for (i = 0; i < ncpus; i++) {
|
||||
@@ -998,7 +1002,6 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
|
||||
}
|
||||
ringRemap[i] = 0;
|
||||
*str = ringRemap;
|
||||
system->type = RCCL_TOPO_4P2H_ROME;
|
||||
INFO(NCCL_GRAPH, "Use 4P2H on Rome: %s", ringRemap);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -304,7 +304,7 @@ group_cleanup:
|
||||
*args->init.newcomm = NULL;
|
||||
} else {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
for (int i=0; i<channel->collCount; i++) {
|
||||
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
|
||||
|
||||
@@ -242,7 +242,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(prof);
|
||||
CUDACHECK(hipFree(comm->hostDevComm.devProf));
|
||||
|
||||
for (int channel=0; channel<comm->p2pnChannels; channel++) {
|
||||
for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) {
|
||||
if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)",
|
||||
comm->rank, channel, (comm->channels[channel].bw_count) ?
|
||||
(float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
|
||||
@@ -368,11 +368,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
|
||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
// Duplicate the channels on the device
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, std::max(comm->nChannels, comm->p2pnChannels)));
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, std::max(comm->nChannels, comm->p2pnChannels)));
|
||||
|
||||
// Copy userRanks and peers
|
||||
for (int r=0; r<comm->p2pnChannels; r++) {
|
||||
for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||
}
|
||||
|
||||
|
||||
Reference in a new issue
Block a user