From a83f872a233c4944cf7cca526b656d68dfef12ca Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Tue, 5 Sep 2023 16:10:41 +0000 Subject: [PATCH] PC Sampling: Create dedicated CP queue Create a dedicated CP queue with highest priority for PC Sampling. Reduce the highest priority that LRTs can set through the existing API so that the PC Sampling queue always has higher priority than any other CP queue. Change-Id: Ia70d74415edc83b4862a3e18dbdbd7cebe73ab47 --- runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 5 +++-- .../core/runtime/amd_gpu_agent.cpp | 19 +++++++++++++++---- .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 5 ++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 868a08dbe8..bbbfeb0398 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -536,8 +536,9 @@ class GpuAgent : public GpuAgentInt { // @brief AQL queues for cache management and blit compute usage. enum QueueEnum { - QueueUtility, // Cache management and device to {host,device} blit compute - QueueBlitOnly, // Host to device blit + QueueUtility, // Cache management and device to {host,device} blit compute + QueueBlitOnly, // Host to device blit + QueuePCSampling, // Dedicated high priority queue for PC Sampling QueueCount }; diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 7b230ba736..8268d9a850 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -741,18 +741,29 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { void GpuAgent::InitDma() { // Setup lazy init pointers on queues and blits. 
-  auto queue_lambda = [this]() {
-    auto ret = CreateInterceptibleQueue();
-    if (ret == nullptr)
+  auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) {
+    auto queue = CreateInterceptibleQueue();
+    if (queue == nullptr)
       throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
                                "Internal queue creation failed.");
-    return ret;
+
+    if (priority != HSA_QUEUE_PRIORITY_NORMAL)
+      if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS)
+        throw AMD::hsa_exception(HSA_STATUS_ERROR,
+                                 "Failed to increase queue priority for PC Sampling");
+    return queue;
   };
+
   // Dedicated compute queue for host-to-device blits.
   queues_[QueueBlitOnly].reset(queue_lambda);
   // Share utility queue with device-to-host blits.
   queues_[QueueUtility].reset(queue_lambda);
 
+  // Dedicated, lazily created compute queue for PC Sampling CP-DMA commands. It runs at highest
+  // priority so CP-DMA commands are not delayed/blocked by dispatches/barriers in other AQL queues.
+  queues_[QueuePCSampling].reset(
+      [queue_lambda]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
+
   // Decide which engine to use for blits.
auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue, bool isHostToDev) { Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index 7b56898520..f855ad339a 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -1072,10 +1072,13 @@ hsa_status_t hsa_amd_queue_set_priority(hsa_queue_t* queue, core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); + // Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH + // HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally + // in ROCR static std::map ext_kmt_priomap = { {HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM}, {HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL}, - {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_MAXIMUM}, + {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH}, }; auto priority_it = ext_kmt_priomap.find(priority);