diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 4eea7c746d..8852d40c1b 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -66,7 +66,7 @@ class BlitSdmaBase : public core::Blit { static const size_t kMaxSingleFillSize; virtual bool isSDMA() const override { return true; } virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi, - size_t linear_copy_size_override) = 0; + size_t linear_copy_size_override, int rec_engine) = 0; virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, @@ -92,7 +92,7 @@ class BlitSdma : public BlitSdmaBase { /// /// @return hsa_status_t virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi, - size_t linear_copy_size_override) override; + size_t linear_copy_size_override, int rec_eng) override; /// @brief Marks the queue object as invalid and uncouples its link with /// the underlying compute device's control block. Use of queue object diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 7b47fa01ae..b105dbd6be 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -161,6 +161,8 @@ class GpuAgentInt : public core::Agent { virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0; + virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0; + // @brief Query if agent represent Kaveri GPU. // // @retval true if agent is Kaveri GPU. @@ -336,6 +338,8 @@ class GpuAgent : public GpuAgentInt { void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) override; + void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) override; + // Getter & setters. // @brief Returns Hive ID @@ -457,7 +461,7 @@ class GpuAgent : public GpuAgentInt { // @brief Create SDMA blit object. // // @retval NULL if SDMA blit creation and initialization failed. - core::Blit* CreateBlitSdma(bool use_xgmi); + core::Blit* CreateBlitSdma(bool use_xgmi, int rec_eng); // @brief Create Kernel blit object using provided compute queue. // @@ -761,6 +765,10 @@ class GpuAgent : public GpuAgentInt { bool DmaEngineIsFree(uint32_t engine_id); std::map gang_peers_info_; + + std::map rec_sdma_eng_id_peers_info_; + + bool uses_rec_sdma_eng_id_mask_; }; } // namespace amd diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index b9af49cac3..c1bc7cdb29 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -115,9 +115,10 @@ class Runtime { public: /// @brief Structure to describe connectivity between agents. struct LinkInfo { - LinkInfo() : num_hop(0), info{0} {} + LinkInfo() : num_hop(0), rec_sdma_eng_id_mask(0), info{0} {} uint32_t num_hop; + uint32_t rec_sdma_eng_id_mask; hsa_amd_memory_pool_link_info_t info; }; @@ -167,7 +168,7 @@ class Runtime { /// @param [in] link_info The link information between source and destination /// nodes. void RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to, - uint32_t num_hop, + uint32_t num_hop, uint32_t rec_sdma_eng_id_mask, hsa_amd_memory_pool_link_info_t& link_info); /// @brief Query link information between two nodes. diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index c823e59087..f8bec1d84c 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -283,10 +283,10 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr HSAKMT_STATUS kmt_status; if (core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) { queue_rsrc.ErrorReason = &exception_signal_->signal_.value; - kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_, + kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_, ring_buf_alloc_bytes_, queue_event_, &queue_rsrc); } else { - kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_, + kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_, ring_buf_alloc_bytes_, NULL, &queue_rsrc); } if (kmt_status != HSAKMT_STATUS_SUCCESS) diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index cb8dff0f55..723054b318 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -133,7 +133,7 @@ BlitSdma::~BlitSdma() template hsa_status_t BlitSdma::Initialize( - const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override) { + const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override, int rec_eng) { if (queue_start_addr_ != NULL) { // Already initialized. return HSA_STATUS_SUCCESS; @@ -191,10 +191,12 @@ hsa_status_t BlitSdma: // device. ROCr creates queues that are of two kinds: PCIe optimized // and xGMI optimized. Which queue to create is indicated via input // boolean flag - const HSA_QUEUE_TYPE kQueueType_ = use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA; - if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100, - HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_, - kQueueSize, NULL, &queue_resource_)) { + const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID : + (use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA); + if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100, + HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng, + queue_start_addr_, kQueueSize, NULL, + &queue_resource_)) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 1b7fa8bc47..ceb11e8d8c 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -720,7 +720,7 @@ core::Queue* GpuAgent::CreateInterceptibleQueue(void (*callback)(hsa_status_t st return queue; } -core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) { +core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) { AMD::BlitSdmaBase* sdma; size_t copy_size_override = 0; const size_t copy_size_overrides[2] = {0x3fffff, 0x3fffffff}; @@ -754,7 +754,9 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) { core::Runtime::runtime_singleton_->flag().enable_sdma_copy_size_override(); if (copy_size_override_setting == Flag::SDMA_DISABLE) copy_size_override = 0; - if (sdma->Initialize(*this, use_xgmi, copy_size_override) != HSA_STATUS_SUCCESS) { + rec_eng = uses_rec_sdma_eng_id_mask_ || !use_xgmi ? rec_eng : -1; + + if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) { sdma->Destroy(*this); delete sdma; sdma = nullptr; @@ -801,7 +803,7 @@ void GpuAgent::InitDma() { queues_[QueuePCSampling].reset([queue_lambda, this]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); }); // Decide which engine to use for blits. - auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue, bool isHostToDev) { + auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue, bool isHostToDev, uint32_t rec_eng) { Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); // User SDMA queues are unstable on gfx8 and unsupported on gfx1013. @@ -817,7 +819,7 @@ void GpuAgent::InitDma() { *blits_[BlitHostToDev]; } - auto ret = CreateBlitSdma(use_xgmi); + auto ret = CreateBlitSdma(use_xgmi, rec_eng); if (ret != nullptr) return ret; } @@ -857,14 +859,15 @@ void GpuAgent::InitDma() { return ret; }); blits_[BlitHostToDev].reset( - [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true); }); + [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true, 0); }); blits_[BlitDevToHost].reset( - [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false); }); + [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false, 1); }); // XGMI engines. for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) { + const int eng = idx - 1; blits_[idx].reset( - [blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility], false); }); + [blit_lambda, this, eng]() { return blit_lambda(true, queues_[QueueUtility], false, eng); }); } // GWS queues. @@ -941,6 +944,25 @@ void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_fa gang_peers_info_[peer.public_handle().handle] = max_bandwidth_factor; } +// Assign direct peer recommended SDMA engine IDs to GPU +void GpuAgent::RegisterRecSdmaEngIdMaskPeer(core::Agent& peer, uint32_t rec_sdma_eng_id_mask) { + auto kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version; + bool rec_eng_enabled = core::Runtime::runtime_singleton_->flag().enable_sdma_recommended_eng() != + Flag::SDMA_DISABLE; + + // Assume all recommended masks with single recommended engine (IsPowerOfTwo) + // will only support targeting that engine and will not gang. + // Also assume support is uniform for every device in the system. + uses_rec_sdma_eng_id_mask_ = (kfd_version.KernelInterfaceMajorVersion > 1 || + (kfd_version.KernelInterfaceMajorVersion == 1 && + kfd_version.KernelInterfaceMinorVersion >= 17)) && + isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() >= 4 && + IsPowerOfTwo(rec_sdma_eng_id_mask) && rec_eng_enabled; + + rec_sdma_eng_id_peers_info_[peer.public_handle().handle] = uses_rec_sdma_eng_id_mask_ ? + rec_sdma_eng_id_mask : 0; +} + // Destroy gang signal static bool GangCopyCompleteHandler(hsa_signal_value_t, void *arg ) { core::Signal *gang_signal = reinterpret_cast(arg); @@ -955,6 +977,13 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, size_t size, std::vector& dep_signals, core::Signal& out_signal) { + // Recommended SDMA engine copies only have gang factor 1 + uint32_t rec_sdma_eng = ffs(rec_sdma_eng_id_peers_info_[dst_agent.public_handle().handle]); + + if (rec_sdma_eng) + return DmaCopyOnEngine(dst, dst_agent, src, src_agent, size, + dep_signals, out_signal, rec_sdma_eng, false); + if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system // domain correctly. diff --git a/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/runtime/hsa-runtime/core/runtime/amd_topology.cpp index e595bffac7..dfe15a936e 100644 --- a/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -238,7 +238,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) { link_info.numa_distance = io_link.Weight; core::Runtime::runtime_singleton_->RegisterLinkInfo( - io_link.NodeFrom, io_link.NodeTo, io_link.Weight, link_info); + io_link.NodeFrom, io_link.NodeTo, io_link.Weight, io_link.RecSdmaEngIdMask, link_info); } } @@ -383,7 +383,7 @@ void BuildTopology() { uint32_t src_id = src_gpu->node_id(); for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { uint32_t dst_id = dst_gpu->node_id(); - uint32_t gang_factor = 1; + uint32_t gang_factor = 1, rec_sdma_eng_id_mask = 0; if (src_id != dst_id) { auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id); @@ -398,12 +398,15 @@ void BuildTopology() { else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth) gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth; else gang_factor = 1; + + rec_sdma_eng_id_mask = linfo.rec_sdma_eng_id_mask; } } // Register all GPUs regardless of connection type to take advantage of easy // key-value lookup later on. ((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, gang_factor); + ((AMD::GpuAgent*)src_gpu)->RegisterRecSdmaEngIdMaskPeer(*dst_gpu, rec_sdma_eng_id_mask); } } } diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index 92a5eb7912..95dc6ed5d8 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -268,10 +268,11 @@ void Runtime::SetLinkCount(size_t num_nodes) { } void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to, - uint32_t num_hop, + uint32_t num_hop, uint32_t rec_sdma_eng_id_mask, hsa_amd_memory_pool_link_info_t& link_info) { const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to); link_matrix_[idx].num_hop = num_hop; + link_matrix_[idx].rec_sdma_eng_id_mask = rec_sdma_eng_id_mask; link_matrix_[idx].info = link_info; // Limit the number of hop to 1 since the runtime does not have enough diff --git a/runtime/hsa-runtime/core/util/flag.h b/runtime/hsa-runtime/core/util/flag.h index 16226b19db..5028e78dea 100644 --- a/runtime/hsa-runtime/core/util/flag.h +++ b/runtime/hsa-runtime/core/util/flag.h @@ -101,6 +101,10 @@ class Flag { enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT); + var = os::GetEnvVar("HSA_ENABLE_SDMA_RECOMMENDED_ENG"); + enable_sdma_recommended_eng_ = (var == "0") ? SDMA_DISABLE : + ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT); + visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES"); filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES"); @@ -288,6 +292,8 @@ class Flag { SDMA_OVERRIDE enable_sdma_copy_size_override() const { return enable_sdma_copy_size_override_; } + SDMA_OVERRIDE enable_sdma_recommended_eng() const { return enable_sdma_recommended_eng_; } + std::string visible_gpus() const { return visible_gpus_; } bool filter_visible_gpus() const { return filter_visible_gpus_; } @@ -384,6 +390,7 @@ class Flag { SDMA_OVERRIDE enable_peer_sdma_; SDMA_OVERRIDE enable_sdma_gang_; SDMA_OVERRIDE enable_sdma_copy_size_override_; + SDMA_OVERRIDE enable_sdma_recommended_eng_; bool filter_visible_gpus_; std::string visible_gpus_;