diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 868a08dbe8..bbbfeb0398 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -536,8 +536,9 @@ class GpuAgent : public GpuAgentInt {
 
   // @brief AQL queues for cache management and blit compute usage.
   enum QueueEnum {
-    QueueUtility,   // Cache management and device to {host,device} blit compute
-    QueueBlitOnly,  // Host to device blit
+    QueueUtility,     // Cache management and device to {host,device} blit compute
+    QueueBlitOnly,    // Host to device blit
+    QueuePCSampling,  // Dedicated high priority queue for PC Sampling
     QueueCount
   };
 
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 7b230ba736..8268d9a850 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -741,18 +741,31 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
 
 void GpuAgent::InitDma() {
   // Setup lazy init pointers on queues and blits.
-  auto queue_lambda = [this]() {
-    auto ret = CreateInterceptibleQueue();
-    if (ret == nullptr)
+  auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) {
+    auto queue = CreateInterceptibleQueue();
+    if (queue == nullptr)
       throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
                                "Internal queue creation failed.");
-    return ret;
+
+    if (priority != HSA_QUEUE_PRIORITY_NORMAL)
+      if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS)
+        throw AMD::hsa_exception(HSA_STATUS_ERROR,
+                                 "Failed to increase queue priority for PC Sampling");
+    return queue;
   };
+
   // Dedicated compute queue for host-to-device blits.
   queues_[QueueBlitOnly].reset(queue_lambda);
   // Share utility queue with device-to-host blits.
   queues_[QueueUtility].reset(queue_lambda);
 
+  // Dedicated compute queue for PC Sampling CP-DMA commands. We need a dedicated queue that runs at
+  // highest priority because we do not want the CP-DMA commands to be delayed/blocked due to
+  // other dispatches/barriers that could be in the other AQL queues.
+  // Pass a closure instead of calling queue_lambda here so the queue is only created on first use.
+  queues_[QueuePCSampling].reset(
+      [queue_lambda]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
+
   // Decide which engine to use for blits.
   auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue, bool isHostToDev) {
     Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
index 7b56898520..f855ad339a 100644
--- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
+++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
@@ -1072,10 +1072,13 @@ hsa_status_t hsa_amd_queue_set_priority(hsa_queue_t* queue,
   core::Queue* cmd_queue = core::Queue::Convert(queue);
   IS_VALID(cmd_queue);
 
+  // Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH.
+  // HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally
+  // in ROCR.
   static std::map ext_kmt_priomap = {
       {HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM},
       {HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL},
-      {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_MAXIMUM},
+      {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH},
   };
 
   auto priority_it = ext_kmt_priomap.find(priority);