Update D2D SDMA ganging for non-SPX modes
xGMI for compute partitioning in non-SPX modes does not have
a reported bandwidth.
Cap the gang factor at 2, since each partition is bounded either
by the number of xGMI links or by the number of available
SDMA contexts.
Change-Id: I09094bd7548d9eee6f039b0efe849838e5de166e
[ROCm/ROCR-Runtime commit: 4c74e47e91]
Este commit está contenido en:
@@ -853,8 +853,7 @@ void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
|
||||
|
||||
// Assign direct peer gang factor to GPU
|
||||
void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_factor) {
|
||||
unsigned int max_gang_factor = std::min(max_bandwidth_factor, properties_.NumSdmaXgmiEngines);
|
||||
gang_peers_info_.push_back(std::pair<core::Agent&,unsigned int>(peer, max_gang_factor));
|
||||
gang_peers_info_.push_back(std::pair<core::Agent&,unsigned int>(peer, max_bandwidth_factor));
|
||||
}
|
||||
|
||||
// Destroy gang signal
|
||||
@@ -878,7 +877,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
|
||||
}
|
||||
|
||||
// Calculate the number of gang items
|
||||
int tmp_gang_factor = 1;
|
||||
unsigned int tmp_gang_factor = 1;
|
||||
for (auto peer_info : gang_peers_info_) {
|
||||
Flag::SDMA_OVERRIDE sdma_gang_override =
|
||||
core::Runtime::runtime_singleton_->flag().enable_sdma_gang();
|
||||
@@ -902,19 +901,27 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
|
||||
|
||||
int gang_factor = 0;
|
||||
uint32_t gang_mask = 0;
|
||||
// Use non-D2D (auxiliary) SDMA engines in the event of xGMI D2D support
|
||||
// when xGMI SDMA context is not available.
|
||||
bool has_aux_gang = tmp_gang_factor >= properties_.NumSdmaEngines && !!!properties_.NumSdmaXgmiEngines;
|
||||
tmp_gang_factor = has_aux_gang ? tmp_gang_factor : std::min(tmp_gang_factor, properties_.NumSdmaXgmiEngines);
|
||||
for (int i = 0; i < tmp_gang_factor; i++) {
|
||||
uint32_t engine_offset = 0;
|
||||
for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
|
||||
if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) {
|
||||
engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount;
|
||||
break;
|
||||
if (has_aux_gang) {
|
||||
if (!DmaEngineIsFree(i + 1) && !blits_[i + 1]->GangStatus()) continue;
|
||||
} else {
|
||||
uint32_t engine_offset = 0;
|
||||
for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
|
||||
if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) {
|
||||
engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Avoid oversubscribing unavailable blit engines that are not already ganged
|
||||
if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset) &&
|
||||
!blits_[engine_offset]->GangStatus()) {
|
||||
continue;
|
||||
// Avoid oversubscribing unavailable blit engines that are not already ganged
|
||||
if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset) &&
|
||||
!blits_[engine_offset]->GangStatus()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
gang_mask |= 1 << i;
|
||||
@@ -958,7 +965,8 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
|
||||
|
||||
// Set leader and gang status to blit
|
||||
SetCopyRequestRefCount(true);
|
||||
lazy_ptr<core::Blit>& blit = GetBlitObject(dst_agent, src_agent, size, i);
|
||||
lazy_ptr<core::Blit>& blit = has_aux_gang ? blits_[i + 1] :
|
||||
GetBlitObject(dst_agent, src_agent, size, i);
|
||||
blit->GangLeader(gang_factor > 1 && !gang_leader_set);
|
||||
blit->GangStatus(gang_factor > 1);
|
||||
|
||||
|
||||
@@ -376,15 +376,23 @@ void BuildTopology() {
|
||||
continue;
|
||||
|
||||
auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
|
||||
GpuAgent *gpu = (AMD::GpuAgent*)src_gpu;
|
||||
// xGMI link type cannot determine bandwidth; keep it fixed for ganging
|
||||
bool has_fixed_gang = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI &&
|
||||
linfo.info.numa_distance != 15;
|
||||
|
||||
// Min Bandwidth < Max Bandwidth if source and destination GPUs are a
|
||||
// single hop away and there exists more than a single xGMI link between
|
||||
// them. Otherwise, destination GPU is not a gang candidate.
|
||||
if (linfo.info.link_type != HSA_AMD_LINK_INFO_TYPE_XGMI ||
|
||||
linfo.info.min_bandwidth == linfo.info.max_bandwidth)
|
||||
(linfo.info.min_bandwidth == linfo.info.max_bandwidth && !has_fixed_gang)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, linfo.info.max_bandwidth/linfo.info.min_bandwidth);
|
||||
uint32_t gang_factor = has_fixed_gang ? 2 : (linfo.info.min_bandwidth ?
|
||||
linfo.info.max_bandwidth/linfo.info.min_bandwidth : 0);
|
||||
|
||||
gpu->RegisterGangPeer(*dst_gpu, gang_factor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Referencia en una nueva incidencia
Block a user