diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index cac4096760..b8a5a41f58 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -600,7 +600,7 @@ class GpuAgent : public GpuAgentInt { void ReleaseScratch(void* base, size_t size, bool large); // Bind index of peer device that is connected via xGMI links - lazy_ptr& GetXgmiBlit(const core::Agent& peer_agent, int gang_id); + lazy_ptr& GetXgmiBlit(const core::Agent& peer_agent); // Bind the Blit object that will drive the copy operation // across PCIe links (H2D or D2H) or is within same device D2D @@ -608,7 +608,7 @@ class GpuAgent : public GpuAgentInt { // Bind the Blit object that will drive the copy operation lazy_ptr& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent, - const size_t size, int gang_id); + const size_t size); // Bind the Blit object that will drive the copy operation by engine ID lazy_ptr& GetBlitObject(uint32_t engine_id); @@ -662,7 +662,7 @@ class GpuAgent : public GpuAgentInt { // Check if SDMA engine by ID is free bool DmaEngineIsFree(uint32_t engine_id); - std::vector> gang_peers_info_; + std::map gang_peers_info_; }; } // namespace amd diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index a77cd65ed1..a88bd5e7c7 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -882,7 +882,7 @@ void GpuAgent::SetCopyStatusCheckRefCount(bool set) { // Assign direct peer gang factor to GPU void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_factor) { - gang_peers_info_.push_back(std::pair(peer, max_bandwidth_factor)); + 
gang_peers_info_[peer.public_handle().handle] = max_bandwidth_factor; } // Destroy gang signal @@ -905,59 +905,22 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, out_signal.async_copy_agent(core::Agent::Convert(this->public_handle())); } - ScopedAcquire lock(&sdma_gang_lock_); - // Calculate the number of gang items - unsigned int tmp_gang_factor = 1; - for (auto peer_info : gang_peers_info_) { - Flag::SDMA_OVERRIDE sdma_gang_override = - core::Runtime::runtime_singleton_->flag().enable_sdma_gang(); - Flag::SDMA_OVERRIDE sdma_override = - core::Runtime::runtime_singleton_->flag().enable_sdma(); - // Blit copies already saturate xGMI - if (sdma_override == Flag::SDMA_DISABLE || sdma_gang_override == Flag::SDMA_DISABLE) { - break; - } + unsigned int gang_factor = 1; + if (core::Runtime::runtime_singleton_->flag().enable_sdma_gang() != Flag::SDMA_DISABLE && + size >= 4096 && dst_agent.device_type() == core::Agent::kAmdGpuDevice) + gang_factor = gang_peers_info_[dst_agent.public_handle().handle]; - // Avoid the latency boundary on small copies - if (size < HSA_PAGE_SIZE_4KB) { - break; - } - - if (dst_agent.public_handle().handle == peer_info.first.public_handle().handle) { - tmp_gang_factor = peer_info.second; - break; - } - } - - int gang_factor = 0; // Use non-D2D (auxillary) SDMA engines in the event of xGMI D2D support // when xGMI SDMA context is not available. - bool has_aux_gang = tmp_gang_factor >= properties_.NumSdmaEngines && !!!properties_.NumSdmaXgmiEngines; - tmp_gang_factor = has_aux_gang ? 
tmp_gang_factor : std::min(tmp_gang_factor, properties_.NumSdmaXgmiEngines); - for (int i = 0; i < tmp_gang_factor; i++) { - if (has_aux_gang && !DmaEngineIsFree(i + 1)) { - break; - } else { - uint32_t engine_offset = 0; - for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) { - if (xgmi_peer_list_[idx]->public_handle().handle == dst_agent.public_handle().handle) { - engine_offset = ((idx + i) % properties_.NumSdmaXgmiEngines) + DefaultBlitCount; - break; - } - } - - // Avoid oversubscribing unavailable blit engines that are not already ganged - if (!!engine_offset && tmp_gang_factor > 1 && !DmaEngineIsFree(engine_offset)) { - break; - } - } - - gang_factor++; - } - - if (!gang_factor) gang_factor = 1; + bool has_aux_gang = gang_factor >= properties_.NumSdmaEngines && + !!!properties_.NumSdmaXgmiEngines; + gang_factor = has_aux_gang ? + std::min(gang_factor, properties_.NumSdmaEngines) : + std::min(gang_factor, properties_.NumSdmaXgmiEngines); + ScopedAcquire lock(&sdma_gang_lock_); + if (gang_factor == 1) sdma_gang_lock_.Release(); // Manage internal gang signals std::vector gang_signals; if (gang_factor > 1) { @@ -985,22 +948,23 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, // Bind the Blit object that will drive this copy operation size_t offset = 0, remainder_size = size; - bool gang_leader_set = false; int gang_sig_count = 0; for (int i = 0; i < gang_factor; i++) { // Set leader and gang status to blit SetCopyRequestRefCount(true); MAKE_SCOPE_GUARD([&]() { SetCopyRequestRefCount(false); }); - lazy_ptr& blit = has_aux_gang ? blits_[i + 1] : - GetBlitObject(dst_agent, src_agent, size, i); - blit->GangLeader(gang_factor > 1 && !gang_leader_set); + lazy_ptr& blit = gang_factor > 1 ? + (has_aux_gang ? 
blits_[i + 1] : blits_[i + DefaultBlitCount]) : + GetBlitObject(dst_agent, src_agent, size); + blit->GangLeader(gang_factor > 1 && !i); hsa_status_t stat; size_t chunk = std::min(remainder_size, (size + gang_factor - 1)/gang_factor); if (!blit->GangLeader() && !gang_signals.empty()) { + std::vector dep_signals_null(0); // only leader has to wait on dependencies stat = blit->SubmitLinearCopyCommand(reinterpret_cast(dst) + offset, reinterpret_cast(src) + offset, - chunk, dep_signals, + chunk, dep_signals_null, *gang_signals[gang_sig_count], gang_signals); gang_sig_count++; } else { @@ -1015,7 +979,6 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, offset += chunk; remainder_size -= chunk; - gang_leader_set = true; } return HSA_STATUS_SUCCESS; @@ -2173,7 +2136,7 @@ lazy_ptr& GpuAgent::GetBlitObject(uint32_t engine_offset) { return blits_[engine_offset]; } -lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent, int gang_id) { +lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) { // Determine if destination is a member xgmi peers list uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines; assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen")); @@ -2184,7 +2147,7 @@ lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent, int ga uint64_t dst_handle = dst_agent.public_handle().handle; uint64_t peer_handle = xgmi_peer_list_[idx]->public_handle().handle; if (peer_handle == dst_handle) { - return blits_[((idx + gang_id) % xgmi_engine_cnt) + DefaultBlitCount]; + return blits_[(idx % xgmi_engine_cnt) + DefaultBlitCount]; } } @@ -2204,8 +2167,7 @@ lazy_ptr& GpuAgent::GetPcieBlit(const core::Agent& dst_agent, lazy_ptr& GpuAgent::GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent, - const size_t size, - int gang_id) { + const size_t size) { // At this point it is guaranteed that one of // the two devices is a GPU, potentially both assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) 
|| @@ -2265,7 +2227,7 @@ lazy_ptr& GpuAgent::GetBlitObject(const core::Agent& dst_agent, return GetPcieBlit(dst_agent, src_agent); } - return GetXgmiBlit(dst_agent, gang_id); + return GetXgmiBlit(dst_agent); } void GpuAgent::Trim() { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp index 142175c326..89c5cd2600 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -372,26 +372,26 @@ void BuildTopology() { uint32_t src_id = src_gpu->node_id(); for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { uint32_t dst_id = dst_gpu->node_id(); + uint32_t gang_factor = 1; - if (src_id == dst_gpu->node_id()) - continue; - - auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id); - // xGMI link type cannot determine bandwidth keep it fixed for ganging - bool has_fixed_gang = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && - linfo.info.numa_distance != 15; - - // Min Bandwidth < Max Bandwidth if source and destination GPUs are a - // single hop way and there exists more than a single xGMI link between - // them. Otherwise, destination GPU is not a gang candidate. 
- if (linfo.info.link_type != HSA_AMD_LINK_INFO_TYPE_XGMI || - (linfo.info.min_bandwidth == linfo.info.max_bandwidth && !has_fixed_gang)) { - continue; + if (src_id != dst_id) { + auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id); + // Ganging can only be done over xGMI and is either fixed or variable + // based on topology information: + // Weight of 13 - Intra-socket GPU link in multi-partition mode + // Weight of 15 - Direct GPU link in single partition mode + // Weight of 41 - Inter-socket GPU link in multi-partition mode + if (linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) { + if (linfo.info.numa_distance == 13 || linfo.info.numa_distance == 41) + gang_factor = 2; + else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth) + gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth; + else gang_factor = 1; + } } - uint32_t gang_factor = has_fixed_gang ? 2 : (linfo.info.min_bandwidth ? - linfo.info.max_bandwidth/linfo.info.min_bandwidth : 0); - + // Register all GPUs regardless of connection type to take advantage of easy + // key-value lookup later on. ((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, gang_factor); } } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h index 304b367ffb..170fd54eee 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h @@ -94,6 +94,7 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_SDMA_GANG"); enable_sdma_gang_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT); + if (enable_sdma_ == SDMA_DISABLE) enable_sdma_gang_ = SDMA_DISABLE; var = os::GetEnvVar("HSA_ENABLE_SDMA_COPY_SIZE_OVERRIDE"); enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE :