SWDEV-561708 Initial shared queue pool apis (#1614)

* SWDEV-561708 Initial shared queue pool apis

* Validate params; some fixes in callback function (but still needs to be checked)

* Dtor cleanup

* minor

* Enable profiling; remove callback since aql_queue takes care of it

* setPriority and setCuMask APIs updated for counted queues

* Increasing step and minor version for rocprofiler

* Tests for CountedQueueManager

* tests

* Code refactored to make pool manager part of GpuAgent only (incomplete); unique handles issue pending

* Refactored code to support CQM inside GpuAgent and unique handles; multithreaded test added

* Changed to ASSERT_SUCCESS macros for all tests

* RIng buffer overflow test added

* tests fixed; cleanup added at hsa_shutdown

* priority conversion table changes

* Compiler warnings fixed

* Rewrite 1 test; add desc and improve SetUp() code

* Improvement

* Unififed getinfo for both counted and non-counted queues

* Address PR feedback

* Addressing feedback: memleak, data type mismatch, documentation

* improve comment

* format

* Missing HSA_API macros for roctracer

* Revert "Addressing feedback: memleak, data type mismatch, documentation"

This reverts commit 5e498a55fb3640e00d06cec63dcec79293fb23de.

* Improving acquire api doc

* release api doc improved

* error codes for release api doc
This commit is contained in:
pghoshamd
2026-01-21 15:30:04 -05:00
committed by GitHub
orang tua f1b313780b
melakukan 793755532f
36 mengubah file dengan 1545 tambahan dan 58 penghapusan
@@ -0,0 +1,855 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/
#include <thread>
#include <mutex>
#include <atomic>
#include <unordered_map>
#include <algorithm>
#include "suites/functional/counted_queues.h"
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa.h"
#include "common/base_rocr_utils.h"
#include "gtest/gtest.h"
#include "common/os.h"
static bool VerifyResult(uint32_t* ar, size_t sz) {
for (size_t i = 0; i < sz; ++i) {
if (i * i != ar[i]) {
return false;
}
}
return true;
}
CountedQueuesTest::CountedQueuesTest() : TestBase() {
set_title("RocR Counted Queues Test");
set_description(
"This test validates the behavior of Shared Counted Queues managed by the "
"Counted Queue Manager in a scenario where different libraries use CP "
"Queues and it avoids oversubscription and a subsequent performance degradation.");
}
CountedQueuesTest::~CountedQueuesTest() {}
void CountedQueuesTest::SetUp() {
const std::string kDefaultLimit = "2";
static const std::unordered_map<std::string, std::string> kQueueLimits = {
{"Counted_Queue_Multithreaded_Dispatch_Test", "1"},
{"Counted_Queue_Overflow_And_Wraparound_Test", "1"},
{"Counted_Queue_Same_Priority_Max_Limit_Test", "4"}};
const ::testing::TestInfo* test_info = ::testing::UnitTest::GetInstance()->current_test_info();
if (test_info) {
const std::string test_name = test_info->name();
// Find the current test's required limit from map and set the env var
// Set default HW queue limit if not found in map
auto it = kQueueLimits.find(test_name);
const std::string& limit = (it != kQueueLimits.end()) ? it->second : kDefaultLimit;
rocrtst::SetEnv("GPU_MAX_HW_QUEUES", limit.c_str());
}
TestBase::SetUp();
}
void CountedQueuesTest::Run() {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::Run();
}
void CountedQueuesTest::Close() {
// This will close handles opened within rocrtst utility calls and call
// hsa_shut_down(), so it should be done after other hsa cleanup
TestBase::Close();
}
void CountedQueuesTest::DisplayResults() const {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::DisplayResults();
}
void CountedQueuesTest::DisplayTestInfo() { TestBase::DisplayTestInfo(); }
void CountedQueuesTest::CountedQueueBasicApiTest() {
// Find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
hsa_queue_t* queue = nullptr;
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &queue));
ASSERT_NE(queue, nullptr);
// Query counted queue and check internal reference count
int32_t use_count = 0;
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
EXPECT_EQ(use_count, 1); // should be 1 after acquire
// Release the queue
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
// Check that ref count is back to 0 after release
hsa_status_t status;
status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count);
ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
}
void CountedQueuesTest::CountedQueues_SamePriority_MaxLimitTest() {
hsa_status_t status;
// Find all GPU agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
ASSERT_FALSE(gpus.empty());
const int NUM_QUEUES = 50;
const int MAX_HW_QUEUES = std::stoi(rocrtst::GetEnv("GPU_MAX_HW_QUEUES"));
std::vector<hsa_queue_t*> queues(NUM_QUEUES, nullptr);
std::vector<uint32_t> hw_ids(NUM_QUEUES, 0);
// Acquire NUM_QUEUES counted queues
for (int i = 0; i < NUM_QUEUES; i++) {
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
&queues[i]));
ASSERT_NE(queues[i], nullptr);
}
// Query HW IDs
for (int i = 0; i < NUM_QUEUES; i++) {
ASSERT_SUCCESS(hsa_amd_queue_get_info(queues[i], HSA_QUEUE_INFO_HW_ID, &hw_ids[i]));
}
// Sort and remove duplicate HW IDs
std::sort(hw_ids.begin(), hw_ids.end());
auto it = std::unique(hw_ids.begin(), hw_ids.end());
hw_ids.resize(std::distance(hw_ids.begin(), it));
// Ensure hardware queue count matches MAX_HW_QUEUES
ASSERT_EQ(hw_ids.size(), MAX_HW_QUEUES);
// Verify even distribution of logical queues over HW queues
// Map HW ID -> use count
std::unordered_map<uint32_t, uint32_t> use_counts;
for (auto* q : queues) {
uint32_t hwid = 0, count = 0;
ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_HW_ID, &hwid));
ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &count));
use_counts[hwid] = count; // overwrites but counts are per-hw, same across queues
}
// Gather all use-counts for fairness check
std::vector<uint32_t> dist;
for (auto& kv : use_counts) {
dist.push_back(kv.second);
}
ASSERT_EQ(dist.size(), MAX_HW_QUEUES);
// Fair distribution: difference should not exceed 1
auto [min_it, max_it] = std::minmax_element(dist.begin(), dist.end());
EXPECT_LE(*max_it - *min_it, 1);
// Release queues
for (auto* q : queues) {
ASSERT_SUCCESS(hsa_amd_counted_queue_release(q));
}
// After release, querying use-count should return invalid argument
for (auto* q : queues) {
uint32_t tmp = 0;
EXPECT_EQ(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &tmp),
HSA_STATUS_ERROR_INVALID_ARGUMENT);
}
}
void CountedQueuesTest::InvalidArgsTest() {
hsa_status_t status;
hsa_queue_t* q = nullptr;
// Find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
// Invalid queue pointer
status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW,
nullptr, nullptr, 0, nullptr);
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
// Invalid priority
const hsa_amd_queue_priority_t invalid_priority = static_cast<hsa_amd_queue_priority_t>(999);
status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, invalid_priority, nullptr,
nullptr, 0, &q);
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
// Support multi producer queues only
status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_SINGLE, HSA_AMD_QUEUE_PRIORITY_LOW,
nullptr, nullptr, 0, &q);
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE_CREATION);
// Check release API params
hsa_queue_t* queue = nullptr;
status = hsa_amd_counted_queue_release(queue);
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
}
void CountedQueuesTest::CountedQueuesAllPrioritiesLimitTest() {
hsa_status_t status;
// Find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
// Acquire 2 queues per priority (total 6 queues)
hsa_queue_t *low1 = nullptr, *low2 = nullptr, *low3 = nullptr;
hsa_queue_t *normal1 = nullptr, *normal2 = nullptr, *normal3 = nullptr;
hsa_queue_t *high1 = nullptr, *high2 = nullptr, *high3 = nullptr;
// Low Priority
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low1));
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low2));
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
&low3)); // should reuse low1
// Normal Priority
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal1));
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal2));
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0,
&normal3)); // should reuse normal1
// High Priority
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high1));
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high2));
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high3));
// Verify reuse and independence per priority
uint32_t low_id1 = 0, low_id2 = 0, low_id3 = 0;
uint32_t norm_id1 = 0, norm_id2 = 0, norm_id3 = 0;
uint32_t high_id1 = 0, high_id2 = 0, high_id3 = 0;
ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_HW_ID, &low_id1));
ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_HW_ID, &low_id2));
ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_HW_ID, &low_id3));
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_HW_ID, &norm_id1));
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_HW_ID, &norm_id2));
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_HW_ID, &norm_id3));
ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_HW_ID, &high_id1));
ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_HW_ID, &high_id2));
ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_HW_ID, &high_id3));
// Within same priority: max 2 unique HW queues
EXPECT_NE(low_id1, low_id2);
EXPECT_TRUE(low_id3 == low_id1);
EXPECT_NE(norm_id1, norm_id2);
EXPECT_TRUE(norm_id3 == norm_id1);
EXPECT_NE(high_id1, high_id2);
EXPECT_TRUE(high_id3 == high_id1);
// Ensure different queues are used across priorities
EXPECT_NE(low_id1, norm_id1);
EXPECT_NE(norm_id1, high_id1);
EXPECT_NE(low_id1, high_id1);
// Verify use counts of first two HW queues
uint32_t low_use1 = 0, low_use2 = 0, low_use3 = 0;
uint32_t norm_use1 = 0, norm_use2 = 0, norm_use3 = 0;
uint32_t high_use1 = 0, high_use2 = 0, high_use3 = 0;
ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_USE_COUNT, &low_use1));
ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_USE_COUNT, &low_use2));
ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_USE_COUNT, &low_use3));
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_USE_COUNT, &norm_use1));
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_USE_COUNT, &norm_use2));
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_USE_COUNT, &norm_use3));
ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_USE_COUNT, &high_use1));
ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_USE_COUNT, &high_use2));
ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_USE_COUNT, &high_use3));
EXPECT_EQ(low_use1, 2);
EXPECT_EQ(low_use2, 1);
EXPECT_TRUE(low_use1 == low_use3); // same HW queues, same ref count
EXPECT_EQ(norm_use1, 2);
EXPECT_EQ(norm_use2, 1);
EXPECT_TRUE(norm_use1 == norm_use3);
EXPECT_EQ(high_use1, 2);
EXPECT_EQ(high_use2, 1);
EXPECT_TRUE(high_use1 == high_use3);
// Release all queues
ASSERT_SUCCESS(hsa_amd_counted_queue_release(low1));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(low2));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(low3));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal1));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal2));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal3));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(high1));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(high2));
ASSERT_SUCCESS(hsa_amd_counted_queue_release(high3));
}
void CountedQueuesTest::CountedQueuesSetPriorityNackTest() {
hsa_status_t status;
// Find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
// Create a counted queue
hsa_queue_t* queue = nullptr;
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue));
EXPECT_NE(queue, nullptr);
// Try to set priority on this queue; should fail
status = hsa_amd_queue_set_priority(queue, HSA_AMD_QUEUE_PRIORITY_HIGH);
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE);
// release queue
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
}
void CountedQueuesTest::CountedQueuesSetCUMaskNackTest() {
hsa_status_t status;
// Find all gpu agents
std::vector<hsa_agent_t> gpus;
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
// Create a counted queue
hsa_queue_t* queue = nullptr;
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue));
EXPECT_NE(queue, nullptr);
// Attempt to set CU mask on counted queue; should fail
uint32_t cu_mask[32] = {0}; // dummy mask
status = hsa_amd_queue_cu_set_mask(queue, 1, cu_mask);
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE);
// release queue
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
}
void CountedQueuesTest::CountedQueuesDispatchTest() {
hsa_status_t status;
// Common setup
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
// Load kernel
set_kernel_file_name("test_case_template_kernels.hsaco");
set_kernel_name("square");
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
// Allocate source buffer
void* src_buffer = nullptr;
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &src_buffer));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, src_buffer));
// Initialize source data
for (uint32_t i = 0; i < 256; ++i) {
reinterpret_cast<uint32_t*>(src_buffer)[i] = i;
}
// Allocate destination buffer
void* dst_buffer = nullptr;
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &dst_buffer));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, dst_buffer));
// Create completion signal
hsa_signal_t completion_signal;
ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &completion_signal));
// Get a counted queue
hsa_queue_t* queue = nullptr;
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
&queue));
EXPECT_NE(queue, nullptr);
// Query queue info
int32_t use_count = 0;
uint32_t hw_id = 0;
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &hw_id));
EXPECT_EQ(use_count, 1);
// Prepare kernel arguments
struct __attribute__((aligned(16))) local_args_t {
uint32_t* dstArray;
uint32_t* srcArray;
uint32_t size;
uint32_t pad;
uint64_t global_offset_x;
uint64_t global_offset_y;
uint64_t global_offset_z;
uint64_t printf_buffer;
uint64_t default_queue;
uint64_t completion_action;
} local_args;
local_args.dstArray = reinterpret_cast<uint32_t*>(dst_buffer);
local_args.srcArray = reinterpret_cast<uint32_t*>(src_buffer);
local_args.size = 256;
local_args.global_offset_x = 0;
local_args.global_offset_y = 0;
local_args.global_offset_z = 0;
local_args.printf_buffer = 0;
local_args.default_queue = 0;
local_args.completion_action = 0;
// Allocate kernel arguments
void* kernarg_address = nullptr;
ASSERT_SUCCESS(
hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
memcpy(kernarg_address, &local_args, sizeof(local_args));
// Dispatch loop
int it = num_iteration() * 5;
const uint32_t queue_mask = queue->size - 1;
for (int i = 0; i < it; i++) {
// Reserve a slot in the queue
uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
// Get pointer to the reserved packet slot
hsa_kernel_dispatch_packet_t* queue_aql_packet =
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[index & queue_mask];
// Fill packet fields
queue_aql_packet->setup = 1;
queue_aql_packet->workgroup_size_x = 256;
queue_aql_packet->workgroup_size_y = 1;
queue_aql_packet->workgroup_size_z = 1;
queue_aql_packet->grid_size_x = 256;
queue_aql_packet->grid_size_y = 1;
queue_aql_packet->grid_size_z = 1;
queue_aql_packet->private_segment_size = 0;
queue_aql_packet->group_segment_size = 0;
queue_aql_packet->kernel_object = kernel_object();
queue_aql_packet->kernarg_address = kernarg_address;
queue_aql_packet->completion_signal = completion_signal;
// Write header for packet
uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
__atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
__ATOMIC_RELEASE);
// Ring doorbell to notify GPU
hsa_signal_store_screlease(queue->doorbell_signal, index);
// Wait for completion signal
while (hsa_signal_wait_scacquire(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
// Reset signal for next iteration
hsa_signal_store_screlease(completion_signal, 1);
// Verify results
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(dst_buffer), 256));
}
// Verify use count before release
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
EXPECT_EQ(use_count, 1);
// Release the counted queue
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
// Verify queue info returns error after release
status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count);
ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
// Cleanup
ASSERT_SUCCESS(hsa_amd_memory_pool_free(kernarg_address));
ASSERT_SUCCESS(hsa_signal_destroy(completion_signal));
ASSERT_SUCCESS(hsa_amd_memory_pool_free(src_buffer));
ASSERT_SUCCESS(hsa_amd_memory_pool_free(dst_buffer));
}
void CountedQueuesTest::CountedQueuesMultithreadedDispatchTest() {
hsa_status_t status;
// Common setup
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
// Load kernel
set_kernel_file_name("test_case_template_kernels.hsaco");
set_kernel_name("square");
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
// Shared source buffer (read-only)
void* shared_src_buffer = nullptr;
ASSERT_SUCCESS(
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer));
// Initialize source data
for (uint32_t i = 0; i < 256; ++i) {
reinterpret_cast<uint32_t*>(shared_src_buffer)[i] = i;
}
// Structures for validation later on
std::mutex hwIdsMutex;
std::vector<uint32_t> allHwIds;
std::atomic<int32_t> maxUseCount{0};
auto func = [&]() {
// local dest buffer for each user application
void* local_dst_buffer = nullptr;
ASSERT_SUCCESS(
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer));
// Local completion signal for every user application
hsa_signal_t local_signal;
ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal));
// Get a counted queue
hsa_queue_t* queue = nullptr;
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
&queue));
EXPECT_NE(queue, nullptr);
if (queue == nullptr) {
hsa_signal_destroy(local_signal);
hsa_amd_memory_pool_free(local_dst_buffer);
return;
}
// Store query results for later analysis
int32_t localUseCount = 0;
uint32_t localHwId = 0;
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &localUseCount));
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &localHwId));
// Update use_count if it is larger than previous value
int expected = maxUseCount.load();
while (localUseCount > expected &&
!maxUseCount.compare_exchange_weak(expected, localUseCount)) {
}
// Store hw id for validation later on
{
std::lock_guard<std::mutex> lock(hwIdsMutex);
allHwIds.push_back(localHwId);
}
struct __attribute__((aligned(16))) local_args_t {
uint32_t* dstArray;
uint32_t* srcArray;
uint32_t size;
uint32_t pad;
uint64_t global_offset_x;
uint64_t global_offset_y;
uint64_t global_offset_z;
uint64_t printf_buffer;
uint64_t default_queue;
uint64_t completion_action;
} local_args;
local_args.dstArray = reinterpret_cast<uint32_t*>(local_dst_buffer);
local_args.srcArray = reinterpret_cast<uint32_t*>(shared_src_buffer);
local_args.size = 256;
local_args.global_offset_x = 0;
local_args.global_offset_y = 0;
local_args.global_offset_z = 0;
local_args.printf_buffer = 0;
local_args.default_queue = 0;
local_args.completion_action = 0;
void* kernarg_address = nullptr;
ASSERT_SUCCESS(
hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
memcpy(kernarg_address, &local_args, sizeof(local_args));
// Dispatch loop
int it = num_iteration() * 5;
const uint32_t queue_mask = queue->size - 1;
for (int i = 0; i < it; i++) {
// Reserve a slot in the queue
uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
// Get pointer to the reserved packet slot and validate address
hsa_kernel_dispatch_packet_t* queue_aql_packet = &(
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[index & queue_mask];
ASSERT_EQ(queue_aql_packet,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address) + (index & queue_mask));
// Fill packet fields
queue_aql_packet->setup = 1;
queue_aql_packet->workgroup_size_x = 256;
queue_aql_packet->workgroup_size_y = 1;
queue_aql_packet->workgroup_size_z = 1;
queue_aql_packet->grid_size_x = 256;
queue_aql_packet->grid_size_y = 1;
queue_aql_packet->grid_size_z = 1;
queue_aql_packet->private_segment_size = 0;
queue_aql_packet->group_segment_size = 0;
queue_aql_packet->kernel_object = kernel_object();
queue_aql_packet->kernarg_address = kernarg_address;
queue_aql_packet->completion_signal = local_signal;
// Write header for packet
uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
__atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
__ATOMIC_RELEASE);
// Ring doorbell to notify GPU
hsa_signal_store_screlease(queue->doorbell_signal, index);
// Wait for completion signal to be less than 1
while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
// Reset signal for next iteration
hsa_signal_store_screlease(local_signal, 1);
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(local_dst_buffer), 256));
}
// Cleanup
hsa_amd_memory_pool_free(kernarg_address);
hsa_signal_destroy(local_signal);
hsa_amd_memory_pool_free(local_dst_buffer);
// Release the counted queue
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
};
constexpr int kThreads = 2;
std::vector<std::thread> threads;
for (int i = 0; i < kThreads; i++) {
threads.emplace_back(func);
}
for (auto& th : threads) {
th.join();
}
// With GPU_MAX_HW_QUEUES=1, all threads should share the same HW queue
// Check if largest useCount is same as the number of user apps accessing queues
EXPECT_EQ(maxUseCount.load(), kThreads);
// All HW IDs should be the same (only 1 HW queue created)
EXPECT_EQ(allHwIds.size(), static_cast<size_t>(kThreads));
for (size_t i = 1; i < allHwIds.size(); i++) {
EXPECT_EQ(allHwIds[i], allHwIds[0]);
}
hsa_amd_memory_pool_free(shared_src_buffer);
}
void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() {
hsa_status_t status;
// Common setup
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
// Load kernel
set_kernel_file_name("test_case_template_kernels.hsaco");
set_kernel_name("square");
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
void* shared_src_buffer = nullptr;
ASSERT_SUCCESS(
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer));
for (uint32_t i = 0; i < 256; ++i) {
reinterpret_cast<uint32_t*>(shared_src_buffer)[i] = i;
}
// To verify that after the queue has been used up, next index wraps around
std::atomic<uint64_t> maxIndexSeen{0};
auto func = [&]() {
// local dest buffer for each user application
void* local_dst_buffer = nullptr;
ASSERT_SUCCESS(
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer));
// Local completion signal for every user application
hsa_signal_t local_signal;
ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal));
// Get a counted queue
hsa_queue_t* queue = nullptr;
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
&queue));
EXPECT_NE(queue, nullptr);
if (queue == nullptr) {
hsa_signal_destroy(local_signal);
hsa_amd_memory_pool_free(local_dst_buffer);
return;
}
uint32_t queue_size = queue->size; // should be 16384
const uint32_t queue_mask = queue_size - 1; // used for index wraparound
struct __attribute__((aligned(16))) local_args_t {
uint32_t* dstArray;
uint32_t* srcArray;
uint32_t size;
uint32_t pad;
uint64_t global_offset_x;
uint64_t global_offset_y;
uint64_t global_offset_z;
uint64_t printf_buffer;
uint64_t default_queue;
uint64_t completion_action;
} local_args;
local_args.dstArray = reinterpret_cast<uint32_t*>(local_dst_buffer);
local_args.srcArray = reinterpret_cast<uint32_t*>(shared_src_buffer);
local_args.size = 256;
local_args.global_offset_x = 0;
local_args.global_offset_y = 0;
local_args.global_offset_z = 0;
local_args.printf_buffer = 0;
local_args.default_queue = 0;
local_args.completion_action = 0;
void* kernarg_address = nullptr;
ASSERT_SUCCESS(
hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
memcpy(kernarg_address, &local_args, sizeof(local_args));
// Dispatch more packets than queue size to force overflow and ensure that indices wrap around
int it = queue_size + 5;
for (int i = 0; i < it; i++) {
// Reserve a slot in the queue
uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
uint64_t curr_max = maxIndexSeen.load();
while (index > curr_max && !maxIndexSeen.compare_exchange_weak(curr_max, index)) {
}
// Get pointer to the reserved packet slot using wraparound masking
uint64_t wrapped_index = index & queue_mask;
hsa_kernel_dispatch_packet_t* queue_aql_packet =
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[wrapped_index];
// Fill packet fields
queue_aql_packet->setup = 1;
queue_aql_packet->workgroup_size_x = 256;
queue_aql_packet->workgroup_size_y = 1;
queue_aql_packet->workgroup_size_z = 1;
queue_aql_packet->grid_size_x = 256;
queue_aql_packet->grid_size_y = 1;
queue_aql_packet->grid_size_z = 1;
queue_aql_packet->private_segment_size = 0;
queue_aql_packet->group_segment_size = 0;
queue_aql_packet->kernel_object = kernel_object();
queue_aql_packet->kernarg_address = kernarg_address;
queue_aql_packet->completion_signal = local_signal;
// Write header for packet
uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
__atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
__ATOMIC_RELEASE);
// Ring doorbell to notify GPU
hsa_signal_store_screlease(queue->doorbell_signal, index);
// Wait for completion signal to be less than 1
while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
// Reset signal for next iteration
hsa_signal_store_screlease(local_signal, 1);
// Verify results are still correct after wraparound
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(local_dst_buffer), 256));
}
// Cleanup
hsa_amd_memory_pool_free(kernarg_address);
hsa_signal_destroy(local_signal);
hsa_amd_memory_pool_free(local_dst_buffer);
// Release the counted queue
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
};
constexpr int kThreads = 2;
std::vector<std::thread> threads;
for (int i = 0; i < kThreads; i++) {
threads.emplace_back(func);
}
for (auto& th : threads) {
th.join();
}
// Verify value of max seen index
uint64_t maxId = maxIndexSeen.load();
EXPECT_EQ(maxId, (16384 + 5) * kThreads - 1);
hsa_amd_memory_pool_free(shared_src_buffer);
}
@@ -0,0 +1,68 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/
#ifndef ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
#define ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
#include "suites/test_common/test_base.h"
class CountedQueuesTest : public TestBase {
public:
explicit CountedQueuesTest();
// @Brief: Destructor for test case of CountedQueuesTest
virtual ~CountedQueuesTest();
// @Brief: Setup the environment for measurement
virtual void SetUp();
// @Brief: Core measurement execution
virtual void Run();
// @Brief: Clean up and retrive the resource
virtual void Close();
// @Brief: Display results
virtual void DisplayResults() const;
// @Brief: Display information about what this test does
virtual void DisplayTestInfo(void);
/// @brief Basic API test to acquire, query and release 1 HW queue
void CountedQueueBasicApiTest();
/// @brief This test verifies that when many queues of the same priority are created,
// they are evenly distributed across the limited set of hardware queues, reuse those
// hardware queues correctly, and report proper use-counts before and after release
void CountedQueues_SamePriority_MaxLimitTest();
// @brief Test to verify HSA status codes for incorrect arguments sent via API
void InvalidArgsTest();
// @brief Test to verify that counted queues across all priorities each reuse only
// their own priority-specific hardware queues
void CountedQueuesAllPrioritiesLimitTest();
/// @brief Test to verify hsa_amd_queue_set_priority() does not work on counted queues
void CountedQueuesSetPriorityNackTest();
/// @brief Test to verify hsa_amd_queue_cu_set_mask() does not work on counted queues
void CountedQueuesSetCUMaskNackTest();
/// @brief Test to verify that a counted queue correctly supports kernel dispatches end-to-end
void CountedQueuesDispatchTest();
/// @brief Test to verify kernel dispatches onto shared queues from multiple user apps even when they
// all share the same HW queue
void CountedQueuesMultithreadedDispatchTest();
/// @brief Test to verify ring buffer wrap around when more than queue_size number of
// AQL packets are enqueued
void CountedQueuesOverflowWrapAroundTest();
};
#endif // ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
@@ -82,6 +82,7 @@
#include "suites/functional/filter_devices.h"
#include "amd_smi/amdsmi.h"
#include "common/common.h"
#include "suites/functional/counted_queues.h"
#include "common/os.h"
static RocrTstGlobals *sRocrtstGlvalues = nullptr;
@@ -507,6 +508,69 @@ TEST(rocrtstFunc, Filter_Devices_Test) {
);
}
TEST(rocrtstFunc, Counted_Queue_Basic_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueueBasicApiTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Same_Priority_Max_Limit_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueues_SamePriority_MaxLimitTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Invalid_Args_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.InvalidArgsTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Multiple_Priorities_Limit_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueuesAllPrioritiesLimitTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Set_Priority_Nack_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueuesSetPriorityNackTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Set_CUMask_Nack_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueuesSetCUMaskNackTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Dispatch_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueuesDispatchTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Multithreaded_Dispatch_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueuesMultithreadedDispatchTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstFunc, Counted_Queue_Overflow_And_Wraparound_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueuesOverflowWrapAroundTest();
RunCustomTestEpilog(&cq);
}
TEST(rocrtstNeg, Memory_Negative_Tests) {
RUN_IF_NOT_EMU_MODE(
MemoryAllocateNegativeTest mt;
@@ -54,6 +54,5 @@
test; \
}
#endif // ROCRTST_SUITES_TEST_COMMON_MAIN_H_
@@ -217,6 +217,7 @@ set ( SRCS core/driver/driver.cpp
core/runtime/cache.cpp
core/runtime/svm_profiler.cpp
core/runtime/thunk_loader.cpp
core/runtime/counted_queue_manager.cpp
core/common/hsa_table_interface.cpp
loader/executable.cpp
libamdhsacode/amd_elf_image.cpp
@@ -1345,6 +1345,18 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
size_copied, status);
}
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(
hsa_agent_t agent, hsa_queue_type_t type, hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
uint64_t flags, hsa_queue_t** queue) {
return amdExtTable->hsa_amd_counted_queue_acquire_fn(agent, type, priority, callback, data, flags,
queue);
}
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue) {
return amdExtTable->hsa_amd_counted_queue_release_fn(queue);
}
// Tools only table interfaces.
namespace rocr {
@@ -64,6 +64,28 @@
extern r_debug _amdgpu_r_debug;
namespace rocr {
/// @brief Mapping between priority type used internally within ROCR to the type used by KFD
// Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH
// HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally
// in ROCR
__forceinline HSA_QUEUE_PRIORITY HsaInternalToKfdPriority(
rocr::HSA::hsa_amd_queue_priority_internal_t priority) {
switch (priority) {
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_LOW:
return HSA_QUEUE_PRIORITY_MINIMUM;
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL:
return HSA_QUEUE_PRIORITY_NORMAL;
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_HIGH:
return HSA_QUEUE_PRIORITY_HIGH;
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM:
return HSA_QUEUE_PRIORITY_MAXIMUM;
default:
return HSA_QUEUE_PRIORITY_NORMAL;
}
}
namespace AMD {
#if defined(__linux__)
@@ -370,10 +392,13 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
}
hsa_status_t KfdDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const {
if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id,
// Convert from ROCR internal priority type to KFD type
HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority);
if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, kfd_priority, sdma_engine_id,
queue_addr, queue_size_bytes, event, &queue_resource)) !=
HSAKMT_STATUS_SUCCESS) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -389,9 +414,12 @@ hsa_status_t KfdDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
}
hsa_status_t KfdDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, void* queue_addr,
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
uint64_t queue_size, HsaEvent* event) const {
if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, priority, queue_addr, queue_size,
// Convert from ROCR internal priority type to KFD type
HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority);
if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, kfd_priority, queue_addr, queue_size,
event)) != HSAKMT_STATUS_SUCCESS) {
return HSA_STATUS_ERROR;
}
@@ -430,7 +430,7 @@ hsa_status_t KfdVirtioDriver::MakeMemoryUnresident(const void* mem) const {
}
hsa_status_t KfdVirtioDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
void* queue_addr, uint64_t queue_size_bytes,
HsaEvent* event, HsaQueueResource& queue_resource) const {
if (vhsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id, queue_addr,
@@ -447,7 +447,7 @@ hsa_status_t KfdVirtioDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
}
hsa_status_t KfdVirtioDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage,
HSA_QUEUE_PRIORITY priority, void* queue_mem,
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem,
uint64_t queue_size, HsaEvent* event) const {
return HSA_STATUS_ERROR;
}
@@ -383,7 +383,7 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
}
hsa_status_t XdnaDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const {
queue_resource.QueueId = AMDXDNA_INVALID_CTX_HANDLE;
@@ -407,7 +407,7 @@ hsa_status_t XdnaDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
}
hsa_status_t XdnaDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, void* queue_addr,
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
uint64_t queue_size, HsaEvent* event) const {
// AIE doesn't support queue updates.
return HSA_STATUS_ERROR_INVALID_QUEUE;
@@ -73,7 +73,7 @@ class AieAqlQueue : public core::Queue,
~AieAqlQueue();
hsa_status_t Inactivate() override;
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override;
void Destroy() override;
uint64_t LoadReadIndexRelaxed() override;
uint64_t LoadReadIndexAcquire() override;
@@ -73,7 +73,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
hsa_status_t Inactivate() override;
/// @brief Change the scheduling priority of the queue
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override;
/// @brief Destroy ref counted queue
void Destroy() override;
@@ -316,7 +316,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
bool suspended_;
// Thunk dispatch and wavefront scheduling priority
HSA_QUEUE_PRIORITY priority_;
HSA::hsa_amd_queue_priority_internal_t priority_;
// Exception notification signal
Signal* exception_signal_;
@@ -62,6 +62,7 @@
#include "core/util/locks.h"
#include "core/util/small_heap.h"
#include "pcs/pcs_runtime.h"
#include "core/inc/counted_queue_manager.h"
namespace rocr {
namespace AMD {
@@ -342,6 +343,16 @@ class GpuAgent : public GpuAgentInt {
void AcquireQueueAltScratch(ScratchInfo& scratch) override;
void ReleaseQueueAltScratch(ScratchInfo& scratch) override;
// @brief Create a pool of shared queues for multiple user applications within a max limit
hsa_status_t AcquireCountedQueue(hsa_queue_type_t type,
HSA::hsa_amd_queue_priority_internal_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
void* data, uint64_t flags,
hsa_queue_t** out_queue);
// @brief Release a queue earlier used by application
hsa_status_t ReleaseCountedQueue(hsa_queue_t* queue);
// @brief Override from AMD::GpuAgentInt.
void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) override;
@@ -645,6 +656,9 @@ class GpuAgent : public GpuAgentInt {
// @brief HSA profile.
hsa_profile_t profile_;
// @brief Pool of shared queues owned by this agent
rocr::core::CountedQueuePoolManager queue_pool_;
void* trap_code_buf_;
size_t trap_code_buf_size_;
@@ -97,10 +97,10 @@ public:
uint32_t node_id) override;
hsa_status_t FreeMemory(void *mem, size_t size) override;
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const override;
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority,
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
@@ -90,12 +90,12 @@ class KfdVirtioDriver final : public core::Driver {
const uint32_t* nodes) const override;
hsa_status_t MakeMemoryUnresident(const void* mem) const override;
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const override;
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage,
HSA_QUEUE_PRIORITY priority, void* queue_mem, uint64_t queue_size,
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem, uint64_t queue_size,
HsaEvent* event) const override;
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t num_cu_mask,
uint32_t* cu_mask) const override;
@@ -204,10 +204,10 @@ public:
uint32_t node_id) override;
hsa_status_t FreeMemory(void *mem, size_t size) override;
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const override;
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority,
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
@@ -0,0 +1,68 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/
#ifndef HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
#define HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
#include "hsa.h"
#include "hsa_ext_amd.h"
#include "core/inc/agent.h"
#include "core/inc/runtime.h"
#include <map>
#include <mutex>
#include <vector>
#include <memory>
namespace rocr {
namespace core {
// Wrapper for a logical counted queue (unique handle + callback)
struct CountedQueue {
core::Queue* hw_queue; // this will store the public handle of HW Queue (hsa_queue_t)
void (*callback)(hsa_status_t, hsa_queue_t*, void*);
void* callback_data;
CountedQueue(core::Queue* hw, void (*cb)(hsa_status_t, hsa_queue_t*, void*), void* data)
: hw_queue(hw), callback(cb), callback_data(data) {}
};
// Manages the pool of counted queues for a single GPU agent
class CountedQueuePoolManager {
public:
explicit CountedQueuePoolManager(core::Agent*);
// Acquire a queue (either reuse or create new)
hsa_status_t AcquireQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data,
uint64_t flags, hsa_queue_t** out_queue);
// Release a logical queue
hsa_status_t ReleaseQueue(hsa_queue_t* queue);
// Called during hsa_shutdown to remove all user and CP queues
void Cleanup();
private:
core::Queue* FindOrCreateHardwareQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
void* data, uint64_t flags);
core::Agent* agent_; // pointer to the gpu agent that owns this pool
uint32_t max_hw_queues_;
std::mutex mutex_;
// Pool of hw queues by priority on the agent
std::map<HSA::hsa_amd_queue_priority_internal_t, std::vector<core::Queue*>> hw_queue_pools_;
// Map from unique handle to CountedQueue (hw queue, metadata per acquire request)
std::map<hsa_queue_t*, std::unique_ptr<CountedQueue>> counted_queues_;
};
} // namespace core
} // namespace rocr
#endif // HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
@@ -50,6 +50,7 @@
#include "core/inc/memory_region.h"
#include "hsakmt/hsakmttypes.h"
#include "inc/hsa.h"
#include "core/inc/hsa_internal.h"
namespace rocr {
namespace core {
@@ -159,7 +160,7 @@ public:
/// @param[in] event HsaEvent for event-driven callbacks.
/// @param[out] queue_resource Queue resource information populated by the driver.
virtual hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const = 0;
@@ -175,7 +176,7 @@ public:
/// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes.
/// @param[in] event HsaEvent for event-driven callbacks.
virtual hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, void* queue_addr,
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
uint64_t queue_size_bytes, HsaEvent* event) const = 0;
/// @brief Set the CU mask for a queue.
@@ -60,7 +60,7 @@ class HostQueue : public Queue {
~HostQueue();
hsa_status_t Inactivate() override { return HSA_STATUS_SUCCESS; }
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override {
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override {
return HSA_STATUS_ERROR_INVALID_QUEUE;
}
@@ -381,6 +381,15 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
uint64_t size, int64_t file_offset,
uint64_t *size_copied, int32_t *status);
// Mirrors Amd Extension Apis
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type,
hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t status,
hsa_queue_t* source, void* data),
void* data, uint64_t flags, hsa_queue_t** queue);
// Mirrors Amd Extension Apis
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue);
// Mirrors Amd Extension Apis
hsa_status_t HSA_API hsa_amd_enable_logging(uint8_t* flags, void* file);
@@ -44,10 +44,18 @@
#define HSA_RUNTIME_CORE_INC_HSA_INTERNAL_H
#include "inc/hsa.h"
#include "inc/hsa_ext_amd.h"
namespace rocr {
namespace HSA {
typedef enum hsa_amd_queue_priority_internal_s {
HSA_AMD_QUEUE_PRIORITY_LOW = HSA_AMD_QUEUE_PRIORITY_LOW,
HSA_AMD_QUEUE_PRIORITY_NORMAL = HSA_AMD_QUEUE_PRIORITY_NORMAL,
HSA_AMD_QUEUE_PRIORITY_HIGH = HSA_AMD_QUEUE_PRIORITY_HIGH,
HSA_AMD_QUEUE_PRIORITY_MAXIMUM = HSA_AMD_QUEUE_PRIORITY_HIGH + 1,
} hsa_amd_queue_priority_internal_t;
// Define core namespace interfaces - copy of function declarations in hsa.h
hsa_status_t hsa_init();
hsa_status_t hsa_shut_down();
@@ -66,7 +66,7 @@ class QueueWrapper : public Queue {
explicit QueueWrapper(std::unique_ptr<Queue> queue)
: Queue(static_cast<core::SharedQueue*>(core::Runtime::runtime_singleton_->system_allocator()(
sizeof(core::SharedQueue), 4096, 0, 0)),
0),
0, nullptr),
wrapped(std::move(queue)) {
memcpy(&amd_queue_, &wrapped->amd_queue_, sizeof(amd_queue_));
wrapped->set_public_handle(wrapped.get(), public_handle_);
@@ -77,7 +77,7 @@ class QueueWrapper : public Queue {
}
hsa_status_t Inactivate() override { return wrapped->Inactivate(); }
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override {
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override {
return wrapped->SetPriority(priority);
}
uint64_t LoadReadIndexAcquire() override { return wrapped->LoadReadIndexAcquire(); }
@@ -168,12 +168,15 @@ All funtions other than Convert and public_handle must be virtual.
*/
class Queue : public Checked<0xFA3906A679F9DB49> {
public:
Queue(SharedQueue* shared_queue, uint64_t queue_flags)
: Queue(shared_queue, queue_flags, false) {}
Queue(SharedQueue* shared_queue, uint64_t queue_flags, core::Agent* agent)
: Queue(shared_queue, queue_flags, false, agent) {}
Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering)
Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering, core::Agent* agent)
: amd_queue_(shared_queue->amd_queue),
use_count(0),
is_counted_queue(false),
shared_queue_(shared_queue),
agent_(agent),
flags_(queue_flags),
pcie_write_ordering_(pcie_write_ordering) {
public_handle_ = Convert(this);
@@ -213,7 +216,7 @@ class Queue : public Checked<0xFA3906A679F9DB49> {
virtual hsa_status_t Inactivate() = 0;
/// @brief Change the scheduling priority of the queue
virtual hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) = 0;
virtual hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) = 0;
/// @brief Reads the Read Index of Queue using Acquire semantics
///
@@ -372,6 +375,13 @@ class Queue : public Checked<0xFA3906A679F9DB49> {
hsa_queue_t* public_handle() const { return public_handle_; }
// Get a pointer to the agent that owns this queue
core::Agent* GetAgent() { return agent_; }
// @brief Attributes specifically for counted queue types
uint32_t use_count;
bool is_counted_queue;
typedef void* rtti_t;
bool IsType(rtti_t id) { return _IsA(id); }
@@ -401,6 +411,8 @@ class Queue : public Checked<0xFA3906A679F9DB49> {
SharedQueue* shared_queue_;
core::Agent* agent_; // pointer to the agent that owns this queue
hsa_queue_t* public_handle_;
/// Next available queue id.
@@ -67,7 +67,7 @@ namespace AMD {
AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_t req_size_pkts,
uint32_t node_id, uint64_t flags)
: Queue(shared_queue, flags),
: Queue(shared_queue, flags, agent),
LocalSignal(0, false),
DoorbellSignal(signal()),
agent_(*agent),
@@ -104,7 +104,7 @@ AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_
HsaQueueResource queue_resource = {};
hsa_status_t status =
agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, HSA_QUEUE_PRIORITY_NORMAL, 0,
agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL, 0,
nullptr, queue_size_bytes_, nullptr, queue_resource);
if (status != HSA_STATUS_SUCCESS) {
throw AMD::hsa_exception(status, "Failed to create a hardware context for an AIE queue.");
@@ -135,7 +135,7 @@ hsa_status_t AieAqlQueue::Inactivate() {
return status;
}
hsa_status_t AieAqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
hsa_status_t AieAqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) {
return HSA_STATUS_SUCCESS;
}
@@ -277,6 +277,13 @@ hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute,
// Hardware doorbell supports AQL semantics.
*static_cast<uint64_t*>(value) = reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
break;
case HSA_QUEUE_INFO_USE_COUNT:
// AIE queues do not support counted queue features
*static_cast<uint32_t*>(value) = static_cast<uint32_t>(-1);
break;
case HSA_QUEUE_INFO_HW_ID:
*static_cast<uint32_t*>(value) = public_handle()->id;
break;
default:
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
@@ -81,7 +81,7 @@ namespace AMD {
AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_size_pkts,
HSAuint32 node_id, ScratchInfo& scratch, core::HsaEventCallback callback,
void* err_data, uint64_t flags)
: Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu()),
: Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu(), agent),
LocalSignal(0, false),
DoorbellSignal(signal()),
ring_buf_(nullptr),
@@ -97,7 +97,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
dynamicScratchState(0),
exceptionState(0),
suspended_(false),
priority_(HSA_QUEUE_PRIORITY_NORMAL),
priority_(HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL),
exception_signal_(nullptr) {
// Queue size is a function of several restrictions.
@@ -495,6 +495,21 @@ hsa_status_t AqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value
*(reinterpret_cast<uint64_t*>(value)) =
reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
break;
case HSA_QUEUE_INFO_USE_COUNT:
if (!is_counted_queue) {
*static_cast<uint32_t*>(value) = static_cast<uint32_t>(-1);
} else {
if (use_count == 0) {
// Queue was released
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
*static_cast<uint32_t*>(value) = use_count;
}
break;
case HSA_QUEUE_INFO_HW_ID:
// Return the hardware queue ID for both counted and non-counted queues
*static_cast<uint32_t*>(value) = public_handle()->id;
break;
default:
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
@@ -631,7 +646,7 @@ hsa_status_t AqlQueue::Inactivate() {
return HSA_STATUS_SUCCESS;
}
hsa_status_t AqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
hsa_status_t AqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) {
if (suspended_) {
return HSA_STATUS_ERROR_INVALID_QUEUE;
}
@@ -184,7 +184,7 @@ hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgm
// boolean flag
const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
(use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM,
if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM,
rec_eng, queue_start_addr_, kQueueSize, nullptr,
queue_resource_) != HSA_STATUS_SUCCESS) {
LogPrint(HSA_AMD_LOG_FLAG_INFO, "Failed to create queue, size=%d, type=%d,"
@@ -99,6 +99,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
scratch_used_large_(0),
queues_(),
queue_pool_(this),
trap_code_buf_(NULL),
trap_code_buf_size_(0),
doorbell_queue_map_(NULL),
@@ -774,13 +775,13 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
void GpuAgent::InitDma() {
// Setup lazy init pointers on queues and blits.
auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) {
auto queue_lambda = [this](HSA::hsa_amd_queue_priority_internal_t priority = HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL) {
auto queue = CreateInterceptibleQueue();
if (queue == nullptr)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
"Internal queue creation failed.");
if (priority != HSA_QUEUE_PRIORITY_NORMAL)
if (priority != HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL)
if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(HSA_STATUS_ERROR,
"Failed to increase queue priority for PC Sampling");
@@ -795,7 +796,7 @@ void GpuAgent::InitDma() {
// Dedicated compute queue for PC Sampling CP-DMA commands. We need a dedicated queue that runs at
// highest priority because we do not want the CP-DMA commands to be delayed/blocked due to
// other dispatches/barriers that could be in the other AQL queues.
queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM); });
// Decide which engine to use for blits.
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev, uint32_t rec_eng) {
@@ -924,6 +925,10 @@ void GpuAgent::PreloadBlits() {
void GpuAgent::ReleaseResources() {
if (this->Enabled()) {
this->Disable();
// Remove all shared hardware queues from pool
queue_pool_.Cleanup();
for (auto& blit : blits_) {
if (!blit.empty()) {
hsa_status_t status = blit->Destroy();
@@ -1829,6 +1834,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
auto aql_queue = new AqlQueue(shared_queue, this, size, node_id(), scratch, event_callback, data,
flags);
*queue = aql_queue;
aql_queues_.push_back(aql_queue);
@@ -3519,5 +3525,17 @@ hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& sessi
return HSA_STATUS_SUCCESS;
}
hsa_status_t GpuAgent::AcquireCountedQueue(hsa_queue_type_t type,
HSA::hsa_amd_queue_priority_internal_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
void* data, uint64_t flags,
hsa_queue_t** out_queue) {
return queue_pool_.AcquireQueue(type, priority, callback, data, flags, out_queue);
}
hsa_status_t GpuAgent::ReleaseCountedQueue(hsa_queue_t* queue) {
return queue_pool_.ReleaseQueue(queue);
}
} // namespace amd
} // namespace rocr
@@ -0,0 +1,145 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/
#include "core/inc/counted_queue_manager.h"
#include "core/inc/agent.h"
#include "core/inc/runtime.h"
namespace rocr {
namespace core {
constexpr size_t DEFAULT_QUEUE_SIZE = 16384;
CountedQueuePoolManager::CountedQueuePoolManager(core::Agent* agent) : agent_(agent) {
// Read in GPU_MAX_HW_QUEUES flag value
max_hw_queues_ = core::Runtime::runtime_singleton_->flag().cp_queues_limit();
}
hsa_status_t CountedQueuePoolManager::AcquireQueue(
hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags,
hsa_queue_t** out_queue) {
std::lock_guard<std::mutex> lock(mutex_);
core::Queue* core_queue = FindOrCreateHardwareQueue(type, priority, callback, data, flags);
if (!core_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
// Create unique SharedQueue structure and store the unique handle in it
SharedQueue* shared_queue = new (std::nothrow) SharedQueue();
if (!shared_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
// Copy amd_queue from HW queue
shared_queue->amd_queue = core_queue->amd_queue_;
// Point to the SAME underlying core::Queue (shared HW queue)
shared_queue->core_queue = core_queue;
// Create a unique handle from this new SharedQueue
hsa_queue_t* unique_handle = &shared_queue->amd_queue.hsa_queue;
// Track metadata
auto counted_q = std::make_unique<CountedQueue>(core_queue, callback, data);
counted_queues_[unique_handle] = std::move(counted_q);
// Increment use count
core_queue->use_count++;
// Mark as a counted queue, if not already set
if (!core_queue->is_counted_queue) {
core_queue->is_counted_queue = true;
}
*out_queue = unique_handle;
return HSA_STATUS_SUCCESS;
}
core::Queue* CountedQueuePoolManager::FindOrCreateHardwareQueue(
hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags) {
auto& pool = hw_queue_pools_[priority];
// Reuse least-used queue if max reached
if (pool.size() >= max_hw_queues_) {
core::Queue* least_used = nullptr;
uint32_t min_count = UINT32_MAX;
for (auto* q : pool) {
if (q->use_count < min_count) {
min_count = q->use_count;
least_used = q;
}
}
return least_used;
}
// Create a new hardware queue
core::Queue* cmd_queue = nullptr;
hsa_status_t status =
agent_->QueueCreate(DEFAULT_QUEUE_SIZE, type, 0, callback, data, 0, 0, &cmd_queue);
if (status != HSA_STATUS_SUCCESS) return nullptr;
status = cmd_queue->SetPriority(priority);
if (status != HSA_STATUS_SUCCESS) return nullptr;
cmd_queue->SetProfiling(true);
// Add to pool
pool.push_back(cmd_queue);
return cmd_queue;
}
hsa_status_t CountedQueuePoolManager::ReleaseQueue(hsa_queue_t* queue) {
std::lock_guard<std::mutex> lock(mutex_);
auto it = counted_queues_.find(queue);
if (it == counted_queues_.end()) return HSA_STATUS_ERROR;
CountedQueue* counted_q = it->second.get();
// Decrement internal ref count inside core::Queue object
if (counted_q->hw_queue->use_count > 0) {
counted_q->hw_queue->use_count--;
// Remove unique handle from map when it is no longer in use by an application
if (counted_q->hw_queue->use_count == 0) {
counted_queues_.erase(queue);
// free the associated shared_queue when removing the counted_queue
SharedQueue* shared = reinterpret_cast<SharedQueue*>(
reinterpret_cast<char*>(queue) - offsetof(SharedQueue, amd_queue.hsa_queue));
delete shared;
}
}
return HSA_STATUS_SUCCESS;
}
void CountedQueuePoolManager::Cleanup() {
std::lock_guard<std::mutex> lock(mutex_);
// Destroy hardware queues
for (auto& priority_pool : hw_queue_pools_) {
for (auto* hw_queue : priority_pool.second) {
if (hw_queue) {
hw_queue->Destroy();
}
}
priority_pool.second.clear();
}
hw_queue_pools_.clear();
// Clean up counted and shared queues
for (auto& cq : counted_queues_) {
// Recover SharedQueue from unique handle and free memory
hsa_queue_t* queue_handle = cq.first;
SharedQueue* shared = reinterpret_cast<SharedQueue*>(
reinterpret_cast<char*>(queue_handle) - offsetof(SharedQueue, amd_queue.hsa_queue));
delete shared;
}
counted_queues_.clear();
}
} // namespace core
} // namespace rocr
@@ -50,7 +50,7 @@ namespace core {
HostQueue::HostQueue(core::SharedQueue* shared_queue, hsa_region_t region, uint32_t ring_size,
hsa_queue_type32_t type, uint32_t features, hsa_signal_t doorbell_signal)
: Queue(shared_queue, 0), size_(ring_size) {
: Queue(shared_queue, 0, nullptr), size_(ring_size) {
HSA::hsa_memory_register(this, sizeof(HostQueue));
MAKE_NAMED_SCOPE_GUARD(registerGuard,
[&]() { HSA::hsa_memory_deregister(this, sizeof(HostQueue)); });
@@ -87,7 +87,7 @@ void HsaApiTable::Init() {
// they can add preprocessor macros on the new functions
constexpr size_t expected_core_api_table_size = 1016;
constexpr size_t expected_amd_ext_table_size = 624;
constexpr size_t expected_amd_ext_table_size = 640;
constexpr size_t expected_image_ext_table_size = 128;
constexpr size_t expected_finalizer_ext_table_size = 64;
constexpr size_t expected_tools_table_size = 64;
@@ -476,6 +476,8 @@ void HsaApiTable::UpdateAmdExts() {
amd_ext_api.hsa_amd_enable_logging_fn = AMD::hsa_amd_enable_logging;
amd_ext_api.hsa_amd_ais_file_write_fn = AMD::hsa_amd_ais_file_write;
amd_ext_api.hsa_amd_ais_file_read_fn = AMD::hsa_amd_ais_file_read;
amd_ext_api.hsa_amd_counted_queue_acquire_fn = AMD::hsa_amd_counted_queue_acquire;
amd_ext_api.hsa_amd_counted_queue_release_fn = AMD::hsa_amd_counted_queue_release;
amd_ext_api.hsa_amd_signal_wait_all_fn = AMD::hsa_amd_signal_wait_all;
amd_ext_api.hsa_amd_memory_get_preferred_copy_engine_fn = AMD::hsa_amd_memory_get_preferred_copy_engine;
amd_ext_api.hsa_amd_portable_export_dmabuf_v2_fn = AMD::hsa_amd_portable_export_dmabuf_v2;
@@ -62,6 +62,7 @@
#include "core/inc/ipc_signal.h"
#include "core/inc/runtime.h"
#include "core/inc/signal.h"
#include "core/inc/counted_queue_manager.h"
namespace rocr {
@@ -714,6 +715,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu
core::Queue* cmd_queue = core::Queue::Convert(queue);
IS_VALID(cmd_queue);
// Check if this a counted queue; NACK if it is
if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE;
if (num_cu_mask_count != 0) IS_BAD_PTR(cu_mask);
if (num_cu_mask_count % 32 != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask);
@@ -1183,22 +1188,13 @@ hsa_status_t hsa_amd_queue_set_priority(hsa_queue_t* queue,
core::Queue* cmd_queue = core::Queue::Convert(queue);
IS_VALID(cmd_queue);
// Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH
// HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally
// in ROCR
static std::map<hsa_amd_queue_priority_t, HSA_QUEUE_PRIORITY> ext_kmt_priomap = {
{HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM},
{HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL},
{HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH},
};
// Check if this a counted queue; NACK if it is
if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE;
auto priority_it = ext_kmt_priomap.find(priority);
if (priority_it == ext_kmt_priomap.end()) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
return cmd_queue->SetPriority(priority_it->second);
// Convert to ROCR internal priority type
HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast<HSA::hsa_amd_queue_priority_internal_t>(priority);
return cmd_queue->SetPriority(priority_);
CATCH;
}
@@ -1532,7 +1528,7 @@ hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* _queue,
core::Queue* queue = core::Queue::Convert(_queue);
IS_VALID(queue);
return queue->GetInfo(attribute, value);
CATCH;
}
@@ -1576,6 +1572,72 @@ hsa_status_t hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devic
CATCH;
}
hsa_status_t HSA_API
hsa_amd_counted_queue_acquire(hsa_agent_t agent,
hsa_queue_type_t type,
hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t status,
hsa_queue_t* source,
void* data),
void* data,
uint64_t flags,
hsa_queue_t** queue) {
TRY;
IS_OPEN();
// Basic validation
if (queue == nullptr) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
// Check priority
if (priority < HSA_AMD_QUEUE_PRIORITY_LOW || priority > HSA_AMD_QUEUE_PRIORITY_HIGH) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
// Only support multi-producer queues
if (type != HSA_QUEUE_TYPE_MULTI) {
return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION;
}
// Convert handle to internal agent
core::Agent* core_agent = core::Agent::Convert(agent);
IS_VALID(core_agent);
if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
return HSA_STATUS_ERROR_INVALID_AGENT;
}
AMD::GpuAgent* gpu_agent = static_cast<AMD::GpuAgent*>(core_agent);
// Convert to ROCR internal priority type
HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast<HSA::hsa_amd_queue_priority_internal_t>(priority);
// Call the queue pool manager
return gpu_agent->AcquireCountedQueue(type, priority_, callback, data, flags, queue);
CATCH;
}
hsa_status_t HSA_API
hsa_amd_counted_queue_release(hsa_queue_t* queue) {
TRY;
IS_OPEN();
// Basic validation
if (queue == nullptr) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
core::Queue* core_queue = core::Queue::Convert(queue);
IS_VALID(core_queue);
core::Agent* core_agent = core_queue->GetAgent();
IS_VALID(core_agent);
if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
return HSA_STATUS_ERROR_INVALID_AGENT;
}
AMD::GpuAgent* gpu_agent = static_cast<AMD::GpuAgent*>(core_agent);
return gpu_agent->ReleaseCountedQueue(queue);
CATCH;
}
hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void *file) {
TRY;
return core::Runtime::runtime_singleton_->EnableLogging(flags, file);
@@ -438,7 +438,9 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
hsa_status_t InterceptQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) {
switch (attribute) {
case HSA_AMD_QUEUE_INFO_AGENT:
case HSA_AMD_QUEUE_INFO_DOORBELL_ID: {
case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
case HSA_QUEUE_INFO_USE_COUNT:
case HSA_QUEUE_INFO_HW_ID: {
if (!AMD::AqlQueue::IsType(wrapped.get())) return HSA_STATUS_ERROR_INVALID_QUEUE;
AMD::AqlQueue* aqlQueue = static_cast<AMD::AqlQueue*>(wrapped.get());
@@ -306,6 +306,8 @@ class Flag {
core_dump_disable_ = (var == "1");
core_dump_pattern_ = os::GetEnvVar("HSA_COREDUMP_PATTERN");
var = os::GetEnvVar("GPU_MAX_HW_QUEUES");
cp_queues_limit_ = var.empty() ? 4 : atoi(var.c_str());
}
void parse_masks(uint32_t maxGpu, uint32_t maxCU) {
@@ -426,6 +428,8 @@ class Flag {
size_t co_dmacopy_size() const { return co_dmacopy_size_; }
uint32_t cp_queues_limit() const { return cp_queues_limit_; }
bool dev_mem_queue_buf() const { return dev_mem_queue_buf_; }
uint32_t signal_abort_timeout() const { return signal_abort_timeout_; }
@@ -545,6 +549,8 @@ class Flag {
bool enable_core_dump_progress_ = false;
std::string core_dump_pattern_;
uint32_t cp_queues_limit_;
// Map GPU index post RVD to its default cu mask.
std::map<uint32_t, std::vector<uint32_t>> cu_mask_;
@@ -266,6 +266,8 @@ global:
hsa_ext_image_create_v2;
hsa_ext_image_data_get_info_v2;
hsa_ext_image_destroy_v2;
hsa_amd_counted_queue_acquire;
hsa_amd_counted_queue_release;
local:
*;
};
@@ -273,6 +273,8 @@ struct AmdExtTable {
decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn;
decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn;
decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn;
decltype(hsa_amd_counted_queue_acquire)* hsa_amd_counted_queue_acquire_fn;
decltype(hsa_amd_counted_queue_release)* hsa_amd_counted_queue_release_fn;
};
// Table to export HSA Core Runtime Apis
@@ -58,7 +58,7 @@
// Step Ids of the Api tables exported by Hsa Core Runtime
#define HSA_API_TABLE_STEP_VERSION 0x01
#define HSA_CORE_API_TABLE_STEP_VERSION 0x00
#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x08
#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x09
#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00
#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01
// Rocprofiler just checks HSA_MAGE_EXT_API_TABLE_STEP_VERSION
@@ -68,7 +68,7 @@
* - 1.15 - hsa_amd_register_system_event_handler: HSA_AMD_SYSTEM_SHUTDOWN
*/
#define HSA_AMD_INTERFACE_VERSION_MAJOR 1
#define HSA_AMD_INTERFACE_VERSION_MINOR 15
#define HSA_AMD_INTERFACE_VERSION_MINOR 16
#ifdef __cplusplus
extern "C" {
@@ -3666,6 +3666,18 @@ typedef enum {
* The type of this attribute is uint64_t.
*/
HSA_AMD_QUEUE_INFO_DOORBELL_ID,
/*
* Returns how many times the underlying hardware queue has been shared.
* @p value will be set to -1 if this queue was not allocated using
* hsa_amd_counted_queue_acquire. The type of this attribute is uint32_t.
*/
HSA_QUEUE_INFO_USE_COUNT,
/*
* Returns a unique ID representing the HW resource used by a counted queue. Two queues
* with the same HW_ID use the same underlying hardware queue. This query can be
* used on counted and non-counted queues. The type of this attribute is uint32_t.
*/
HSA_QUEUE_INFO_HW_ID,
} hsa_queue_info_attribute_t;
hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute,
@@ -3760,6 +3772,81 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
uint64_t size, int64_t file_offset,
uint64_t *size_copied, int32_t *status);
/**
* @brief Create a queue that is limited by the GPU_MAX_HW_QUEUES environment variable.
*
* This underlying hardware queue returned by this function may be shared by other queues. For
* each priority this function will create a new hardware queue as long as the number of queues
* is less than GPU_MAX_HW_QUEUES. Once the GPU_MAX_HW_QUEUES limit is reached, this function will
* stop creating new hardware queues and return a reference to an existing queue of the requested
* @p priority instead. Each successful call will return a different @p queue handle.
* The hsa_amd_queue_get_info API can be used to determine whether this queue is currently shared.
*
* When there are multiple eligible hardware queues available, the queue with the lowest
* HSA_QUEUE_INFO_USE_COUNT will be returned.
*
* For each successful call, hsa_amd_counted_queue_release should be called to release the
* HSA_QUEUE_INFO_USE_COUNT. After release, the queue handle becomes invalid and must not be used.
*
* hsa_amd_queue_set_priority and hsa_amd_queue_cu_set_mask cannot be used on counted queues.
*
* @param[in] agent Agent where to create the queue
*
* @param[in] type For future use. HSA_QUEUE_TYPE_MULTI is the only valid option.
*
* @param[in] priority Associated priority. The GPU_MAX_HW_QUEUES limit is counted for each priority
*
* @param[in] callback callback invoked by the HSA runtime for every asynchronous event related to
* the newly created queue. May be NULL. The HSA runtime passes three arguments to the callback: a
* code identifying the event that triggered the invocation, a pointer to the queue where the event
* originated, and the application data.
*
* @param[in] data Application data that is passed to @p callback on every iteration. May be NULL.
*
* @param[in] flags For future use. Ignored.
*
* @param[out] queue Unique handle to reference the newly created queue.
*
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
*
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
*
* @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate the resources required
* by the implementation.
*
* @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid or not a GPU agent.
*
* @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p type is not HSA_QUEUE_TYPE_MULTI.
*
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid priority or NULL queue pointer.
*/
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type,
hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t status,
hsa_queue_t* source, void* data),
void* data, uint64_t flags, hsa_queue_t** queue);
/**
* @brief Release a counted queue and decrements its use count.
*
* Releases a queue that was previously acquired using hsa_amd_counted_queue_acquire.
* Each call to this API decrements the internal use count HSA_QUEUE_INFO_USE_COUNT
* of the underlying hardware. After this call, queue handle is invalid and must not be used.
* Once created, the hardware queue is retained until hsa_shutdown is called to avoid costly
* overhead of repeatedly creating new hardware queues, allowing them to be reused.
*
*
* @param[in] queue Counted queue handle returned from hsa_amd_counted_queue_acquire.
* Must not be NULL.
*
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
* @retval ::HSA_STATUS_ERROR Invalid queue or queue was already released.
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The @p queue is NULL or previously released.
* @retval ::HSA_STATUS_ERROR_INVALID_AGENT The queue's agent is invalid or not a GPU agent.
*/
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue);
/**
* @brief logging types
*/