Update D2D SDMA ganging for non-SPX modes

xGMI for compute partitioning in non-SPX modes does not have
a reported bandwidth.
Fix the gang factor to at most 2 since each partition is either bounded
by the number of xGMI links or the number of available
SDMA contexts.

Change-Id: I09094bd7548d9eee6f039b0efe849838e5de166e


[ROCm/ROCR-Runtime commit: 4c74e47e91]
Este commit está contenido en:
Jonathan Kim
2023-05-19 17:17:50 -04:00
padre 2994cfa875
commit 58d5f7354f
Se han modificado 2 ficheros con 32 adiciones y 16 borrados
@@ -853,8 +853,7 @@ void GpuAgent::SetCopyStatusCheckRefCount(bool set) {
// Assign direct peer gang factor to GPU
void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_factor) {
unsigned int max_gang_factor = std::min(max_bandwidth_factor, properties_.NumSdmaXgmiEngines);
gang_peers_info_.push_back(std::pair<core::Agent&,unsigned int>(peer, max_gang_factor));
gang_peers_info_.push_back(std::pair<core::Agent&,unsigned int>(peer, max_bandwidth_factor));
}
// Destroy gang signal
@@ -878,7 +877,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
}
// Calculate the number of gang items
int tmp_gang_factor = 1;
unsigned int tmp_gang_factor = 1;
for (auto peer_info : gang_peers_info_) {
Flag::SDMA_OVERRIDE sdma_gang_override =
core::Runtime::runtime_singleton_->flag().enable_sdma_gang();
@@ -902,19 +901,27 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
int gang_factor = 0;
uint32_t gang_mask = 0;
// Use non-D2D (auxiliary) SDMA engines in the event of xGMI D2D support
// when xGMI SDMA context is not available.
bool has_aux_gang = tmp_gang_factor >= properties_.NumSdmaEngines && !!!properties_.NumSdmaXgmiEngines;
tmp_gang_factor = has_aux_gang ? tmp_gang_factor : std::min(tmp_gang_factor, properties_.NumSdmaXgmiEngines);
for (int i = 0; i < tmp_gang_factor; i++) {
uint32_t engine_offset = 0;
for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) {
engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount;
break;
if (has_aux_gang) {
if (!DmaEngineIsFree(i + 1) && !blits_[i + 1]->GangStatus()) continue;
} else {
uint32_t engine_offset = 0;
for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) {
engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount;
break;
}
}
}
// Avoid oversubscribing unavailable blit engines that are not already ganged
if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset) &&
!blits_[engine_offset]->GangStatus()) {
continue;
// Avoid oversubscribing unavailable blit engines that are not already ganged
if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset) &&
!blits_[engine_offset]->GangStatus()) {
continue;
}
}
gang_mask |= 1 << i;
@@ -958,7 +965,8 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
// Set leader and gang status to blit
SetCopyRequestRefCount(true);
lazy_ptr<core::Blit>& blit = GetBlitObject(dst_agent, src_agent, size, i);
lazy_ptr<core::Blit>& blit = has_aux_gang ? blits_[i + 1] :
GetBlitObject(dst_agent, src_agent, size, i);
blit->GangLeader(gang_factor > 1 && !gang_leader_set);
blit->GangStatus(gang_factor > 1);
@@ -376,15 +376,23 @@ void BuildTopology() {
continue;
auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
GpuAgent *gpu = (AMD::GpuAgent*)src_gpu;
// xGMI link type cannot determine bandwidth; keep it fixed for ganging
bool has_fixed_gang = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI &&
linfo.info.numa_distance != 15;
// Min Bandwidth < Max Bandwidth if source and destination GPUs are a
// single hop away and there exists more than a single xGMI link between
// them. Otherwise, destination GPU is not a gang candidate.
if (linfo.info.link_type != HSA_AMD_LINK_INFO_TYPE_XGMI ||
linfo.info.min_bandwidth == linfo.info.max_bandwidth)
(linfo.info.min_bandwidth == linfo.info.max_bandwidth && !has_fixed_gang)) {
continue;
}
((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, linfo.info.max_bandwidth/linfo.info.min_bandwidth);
uint32_t gang_factor = has_fixed_gang ? 2 : (linfo.info.min_bandwidth ?
linfo.info.max_bandwidth/linfo.info.min_bandwidth : 0);
gpu->RegisterGangPeer(*dst_gpu, gang_factor);
}
}
}