From 58d5f7354f7d0ef812fbbe0d1e1c1913870be34f Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Fri, 19 May 2023 17:17:50 -0400 Subject: [PATCH] Update D2D SDMA ganging for non-SPX modes xGMI for compute partitioning in non-SPX modes does not have a reported bandwidth. Fix it to at most 2 since each partition is either bounded by the number of xGMI links or the number of available SDMA contexts. Change-Id: I09094bd7548d9eee6f039b0efe849838e5de166e [ROCm/ROCR-Runtime commit: 4c74e47e91a69e14a5ac77ce8596807a0207d1a3] --- .../core/runtime/amd_gpu_agent.cpp | 36 +++++++++++-------- .../hsa-runtime/core/runtime/amd_topology.cpp | 12 +++++-- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 3c3a7ebc5f..b007c0fbd9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -853,8 +853,7 @@ void GpuAgent::SetCopyStatusCheckRefCount(bool set) { // Assign direct peer gang factor to GPU void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_factor) { - unsigned int max_gang_factor = std::min(max_bandwidth_factor, properties_.NumSdmaXgmiEngines); - gang_peers_info_.push_back(std::pair(peer, max_gang_factor)); + gang_peers_info_.push_back(std::pair(peer, max_bandwidth_factor)); } // Destroy gang signal @@ -878,7 +877,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, } // Calculate the number of gang items - int tmp_gang_factor = 1; + unsigned int tmp_gang_factor = 1; for (auto peer_info : gang_peers_info_) { Flag::SDMA_OVERRIDE sdma_gang_override = core::Runtime::runtime_singleton_->flag().enable_sdma_gang(); @@ -902,19 +901,27 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, int gang_factor = 0; uint32_t gang_mask = 0; + // Use non-D2D 
(auxiliary) SDMA engines in the event of xGMI D2D support + // when xGMI SDMA context is not available. + bool has_aux_gang = tmp_gang_factor >= properties_.NumSdmaEngines && !!!properties_.NumSdmaXgmiEngines; + tmp_gang_factor = has_aux_gang ? tmp_gang_factor : std::min(tmp_gang_factor, properties_.NumSdmaXgmiEngines); for (int i = 0; i < tmp_gang_factor; i++) { - uint32_t engine_offset = 0; - for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) { - if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) { - engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount; - break; + if (has_aux_gang) { + if (!DmaEngineIsFree(i + 1) && !blits_[i + 1]->GangStatus()) continue; + } else { + uint32_t engine_offset = 0; + for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) { + if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) { + engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount; + break; + } } - } - // Avoid oversubscribing unavailable blit engines that are not already ganged - if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset) && - !blits_[engine_offset]->GangStatus()) { - continue; + // Avoid oversubscribing unavailable blit engines that are not already ganged + if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset) && + !blits_[engine_offset]->GangStatus()) { + continue; + } } gang_mask |= 1 << i; @@ -958,7 +965,8 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, // Set leader and gang status to blit SetCopyRequestRefCount(true); - lazy_ptr& blit = GetBlitObject(dst_agent, src_agent, size, i); + lazy_ptr& blit = has_aux_gang ? 
blits_[i + 1] : + GetBlitObject(dst_agent, src_agent, size, i); blit->GangLeader(gang_factor > 1 && !gang_leader_set); blit->GangStatus(gang_factor > 1); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp index 77601b24e5..1f5aa53109 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -376,15 +376,23 @@ void BuildTopology() { continue; auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id); + GpuAgent *gpu = (AMD::GpuAgent*)src_gpu; + // xGMI link type cannot determine bandwidth keep it fixed for ganging + bool has_fixed_gang = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && + linfo.info.numa_distance != 15; // Min Bandwidth < Max Bandwidth if source and destination GPUs are a // single hop way and there exists more than a single xGMI link between // them. Otherwise, destination GPU is not a gang candidate. if (linfo.info.link_type != HSA_AMD_LINK_INFO_TYPE_XGMI || - linfo.info.min_bandwidth == linfo.info.max_bandwidth) + (linfo.info.min_bandwidth == linfo.info.max_bandwidth && !has_fixed_gang)) { continue; + } - ((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, linfo.info.max_bandwidth/linfo.info.min_bandwidth); + uint32_t gang_factor = has_fixed_gang ? 2 : (linfo.info.min_bandwidth ? + linfo.info.max_bandwidth/linfo.info.min_bandwidth : 0); + + gpu->RegisterGangPeer(*dst_gpu, gang_factor); } } }