From 793755532fe79fe29b42c0c53a644a97f695d1b7 Mon Sep 17 00:00:00 2001 From: pghoshamd Date: Wed, 21 Jan 2026 15:30:04 -0500 Subject: [PATCH] SWDEV-561708 Initial shared queue pool apis (#1614) * SWDEV-561708 Initial shared queue pool apis * Validate params; some fixes in callback function (but still needs to be checked) * Dtor cleanup * minor * Enable profiling; remove callback since aql_queue takes care of it * setPriority and setCuMask APIs updated for counted queues * Increasing step and minor version for rocprofiler * Tests for CountedQueueManager * tests * Code refactored to make pool manager part of GpuAgent only (incomplete); unique handles issue pending * Refactored code to support CQM inside GpuAgent and unique handles; multithreaded test added * Changed to ASSERT_SUCCESS macros for all tests * RIng buffer overflow test added * tests fixed; cleanup added at hsa_shutdown * priority conversion table changes * Compiler warnings fixed * Rewrite 1 test; add desc and improve SetUp() code * Improvement * Unififed getinfo for both counted and non-counted queues * Address PR feedback * Addressing feedback: memleak, data type mismatch, documentation * improve comment * format * Missing HSA_API macros for roctracer * Revert "Addressing feedback: memleak, data type mismatch, documentation" This reverts commit 5e498a55fb3640e00d06cec63dcec79293fb23de. * Improving acquire api doc * release api doc improved * error codes for release api doc --- .../suites/functional/counted_queues.cc | 855 ++++++++++++++++++ .../suites/functional/counted_queues.h | 68 ++ .../rocrtst/suites/test_common/main.cc | 64 ++ .../rocrtst/suites/test_common/main.h | 1 - .../runtime/hsa-runtime/CMakeLists.txt | 1 + .../core/common/hsa_table_interface.cpp | 12 + .../core/driver/kfd/amd_kfd_driver.cpp | 36 +- .../driver/virtio/amd_kfd_virtio_driver.cpp | 4 +- .../core/driver/xdna/amd_xdna_driver.cpp | 4 +- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 2 +- .../hsa-runtime/core/inc/amd_aql_queue.h | 4 +- .../hsa-runtime/core/inc/amd_gpu_agent.h | 14 + .../hsa-runtime/core/inc/amd_kfd_driver.h | 4 +- .../hsa-runtime/core/inc/amd_virtio_driver.h | 4 +- .../hsa-runtime/core/inc/amd_xdna_driver.h | 4 +- .../core/inc/counted_queue_manager.h | 68 ++ .../runtime/hsa-runtime/core/inc/driver.h | 5 +- .../runtime/hsa-runtime/core/inc/host_queue.h | 2 +- .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 9 + .../hsa-runtime/core/inc/hsa_internal.h | 8 + .../hsa-runtime/core/inc/intercept_queue.h | 4 +- .../runtime/hsa-runtime/core/inc/queue.h | 20 +- .../core/runtime/amd_aie_aql_queue.cpp | 13 +- .../core/runtime/amd_aql_queue.cpp | 21 +- .../core/runtime/amd_blit_sdma.cpp | 2 +- .../core/runtime/amd_gpu_agent.cpp | 24 +- .../core/runtime/counted_queue_manager.cpp | 145 +++ .../hsa-runtime/core/runtime/host_queue.cpp | 2 +- .../core/runtime/hsa_api_trace.cpp | 4 +- .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 94 +- .../core/runtime/intercept_queue.cpp | 4 +- .../runtime/hsa-runtime/core/util/flag.h | 6 + .../runtime/hsa-runtime/hsacore.so.def | 2 + .../runtime/hsa-runtime/inc/hsa_api_trace.h | 2 + .../hsa-runtime/inc/hsa_api_trace_version.h | 2 +- .../runtime/hsa-runtime/inc/hsa_ext_amd.h | 89 +- 36 files changed, 1545 insertions(+), 58 deletions(-) create mode 100644 projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc create mode 100644 projects/rocr-runtime/rocrtst/suites/functional/counted_queues.h create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp diff --git a/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc new file mode 100644 index 0000000000..da65940777 --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc @@ -0,0 +1,855 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include +#include + +#include "suites/functional/counted_queues.h" +#include "hsa/hsa_ext_amd.h" +#include "hsa/hsa.h" +#include "common/base_rocr_utils.h" +#include "gtest/gtest.h" +#include "common/os.h" + + +static bool VerifyResult(uint32_t* ar, size_t sz) { + for (size_t i = 0; i < sz; ++i) { + if (i * i != ar[i]) { + return false; + } + } + return true; +} + +CountedQueuesTest::CountedQueuesTest() : TestBase() { + set_title("RocR Counted Queues Test"); + set_description( + "This test validates the behavior of Shared Counted Queues managed by the " + "Counted Queue Manager in a scenario where different libraries use CP " + "Queues and it avoids oversubscription and a subsequent performance degradation."); +} + +CountedQueuesTest::~CountedQueuesTest() {} + +void CountedQueuesTest::SetUp() { + const std::string kDefaultLimit = "2"; + static const std::unordered_map kQueueLimits = { + {"Counted_Queue_Multithreaded_Dispatch_Test", "1"}, + {"Counted_Queue_Overflow_And_Wraparound_Test", "1"}, + {"Counted_Queue_Same_Priority_Max_Limit_Test", "4"}}; + + const ::testing::TestInfo* test_info = ::testing::UnitTest::GetInstance()->current_test_info(); + if (test_info) { + const std::string test_name = test_info->name(); + + // Find the current test's required limit from map and set the env var + // Set default HW queue limit if not found in map + auto it = kQueueLimits.find(test_name); + const std::string& limit = (it != kQueueLimits.end()) ? it->second : kDefaultLimit; + rocrtst::SetEnv("GPU_MAX_HW_QUEUES", limit.c_str()); + } + + TestBase::SetUp(); +} + +void CountedQueuesTest::Run() { + // Compare required profile for this test case with what we're actually + // running on + if (!rocrtst::CheckProfile(this)) { + return; + } + TestBase::Run(); +} + +void CountedQueuesTest::Close() { + // This will close handles opened within rocrtst utility calls and call + // hsa_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + +void CountedQueuesTest::DisplayResults() const { + // Compare required profile for this test case with what we're actually + // running on + if (!rocrtst::CheckProfile(this)) { + return; + } + TestBase::DisplayResults(); +} + +void CountedQueuesTest::DisplayTestInfo() { TestBase::DisplayTestInfo(); } + +void CountedQueuesTest::CountedQueueBasicApiTest() { + // Find all gpu agents + std::vector gpus; + ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus)); + + hsa_queue_t* queue = nullptr; + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &queue)); + ASSERT_NE(queue, nullptr); + + // Query counted queue and check internal reference count + int32_t use_count = 0; + ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count)); + EXPECT_EQ(use_count, 1); // should be 1 after acquire + + // Release the queue + ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue)); + + // Check that ref count is back to 0 after release + hsa_status_t status; + status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count); + ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT); +} + +void CountedQueuesTest::CountedQueues_SamePriority_MaxLimitTest() { + hsa_status_t status; + + // Find all GPU agents + std::vector gpus; + ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus)); + ASSERT_FALSE(gpus.empty()); + + const int NUM_QUEUES = 50; + const int MAX_HW_QUEUES = std::stoi(rocrtst::GetEnv("GPU_MAX_HW_QUEUES")); + + std::vector queues(NUM_QUEUES, nullptr); + std::vector hw_ids(NUM_QUEUES, 0); + + // Acquire NUM_QUEUES counted queues + for (int i = 0; i < NUM_QUEUES; i++) { + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, + HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, + &queues[i])); + ASSERT_NE(queues[i], nullptr); + } + + // Query HW IDs + for (int i = 0; i < NUM_QUEUES; i++) { + ASSERT_SUCCESS(hsa_amd_queue_get_info(queues[i], HSA_QUEUE_INFO_HW_ID, &hw_ids[i])); + } + + // Sort and remove duplicate HW IDs + std::sort(hw_ids.begin(), hw_ids.end()); + auto it = std::unique(hw_ids.begin(), hw_ids.end()); + hw_ids.resize(std::distance(hw_ids.begin(), it)); + + // Ensure hardware queue count matches MAX_HW_QUEUES + ASSERT_EQ(hw_ids.size(), MAX_HW_QUEUES); + + // Verify even distribution of logical queues over HW queues + // Map HW ID -> use count + std::unordered_map use_counts; + + for (auto* q : queues) { + uint32_t hwid = 0, count = 0; + ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_HW_ID, &hwid)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &count)); + use_counts[hwid] = count; // overwrites but counts are per-hw, same across queues + } + + // Gather all use-counts for fairness check + std::vector dist; + for (auto& kv : use_counts) { + dist.push_back(kv.second); + } + + ASSERT_EQ(dist.size(), MAX_HW_QUEUES); + + // Fair distribution: difference should not exceed 1 + auto [min_it, max_it] = std::minmax_element(dist.begin(), dist.end()); + EXPECT_LE(*max_it - *min_it, 1); + + // Release queues + for (auto* q : queues) { + ASSERT_SUCCESS(hsa_amd_counted_queue_release(q)); + } + + // After release, querying use-count should return invalid argument + for (auto* q : queues) { + uint32_t tmp = 0; + EXPECT_EQ(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &tmp), + HSA_STATUS_ERROR_INVALID_ARGUMENT); + } +} + +void CountedQueuesTest::InvalidArgsTest() { + hsa_status_t status; + hsa_queue_t* q = nullptr; + + // Find all gpu agents + std::vector gpus; + ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus)); + + // Invalid queue pointer + status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, + nullptr, nullptr, 0, nullptr); + EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT); + + // Invalid priority + const hsa_amd_queue_priority_t invalid_priority = static_cast(999); + status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, invalid_priority, nullptr, + nullptr, 0, &q); + EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT); + + // Support multi producer queues only + status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_SINGLE, HSA_AMD_QUEUE_PRIORITY_LOW, + nullptr, nullptr, 0, &q); + EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE_CREATION); + + // Check release API params + hsa_queue_t* queue = nullptr; + status = hsa_amd_counted_queue_release(queue); + EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT); +} + +void CountedQueuesTest::CountedQueuesAllPrioritiesLimitTest() { + hsa_status_t status; + + // Find all gpu agents + std::vector gpus; + ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus)); + + // Acquire 2 queues per priority (total 6 queues) + hsa_queue_t *low1 = nullptr, *low2 = nullptr, *low3 = nullptr; + hsa_queue_t *normal1 = nullptr, *normal2 = nullptr, *normal3 = nullptr; + hsa_queue_t *high1 = nullptr, *high2 = nullptr, *high3 = nullptr; + + // Low Priority + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low1)); + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low2)); + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, + HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, + &low3)); // should reuse low1 + + // Normal Priority + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal1)); + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal2)); + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, + HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, + &normal3)); // should reuse normal1 + + // High Priority + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high1)); + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high2)); + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high3)); + + // Verify reuse and independence per priority + uint32_t low_id1 = 0, low_id2 = 0, low_id3 = 0; + uint32_t norm_id1 = 0, norm_id2 = 0, norm_id3 = 0; + uint32_t high_id1 = 0, high_id2 = 0, high_id3 = 0; + + ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_HW_ID, &low_id1)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_HW_ID, &low_id2)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_HW_ID, &low_id3)); + + ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_HW_ID, &norm_id1)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_HW_ID, &norm_id2)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_HW_ID, &norm_id3)); + + ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_HW_ID, &high_id1)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_HW_ID, &high_id2)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_HW_ID, &high_id3)); + + // Within same priority: max 2 unique HW queues + EXPECT_NE(low_id1, low_id2); + EXPECT_TRUE(low_id3 == low_id1); + + EXPECT_NE(norm_id1, norm_id2); + EXPECT_TRUE(norm_id3 == norm_id1); + + EXPECT_NE(high_id1, high_id2); + EXPECT_TRUE(high_id3 == high_id1); + + // Ensure different queues are used across priorities + EXPECT_NE(low_id1, norm_id1); + EXPECT_NE(norm_id1, high_id1); + EXPECT_NE(low_id1, high_id1); + + // Verify use counts of first two HW queues + uint32_t low_use1 = 0, low_use2 = 0, low_use3 = 0; + uint32_t norm_use1 = 0, norm_use2 = 0, norm_use3 = 0; + uint32_t high_use1 = 0, high_use2 = 0, high_use3 = 0; + + ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_USE_COUNT, &low_use1)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_USE_COUNT, &low_use2)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_USE_COUNT, &low_use3)); + + ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_USE_COUNT, &norm_use1)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_USE_COUNT, &norm_use2)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_USE_COUNT, &norm_use3)); + + ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_USE_COUNT, &high_use1)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_USE_COUNT, &high_use2)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_USE_COUNT, &high_use3)); + + EXPECT_EQ(low_use1, 2); + EXPECT_EQ(low_use2, 1); + EXPECT_TRUE(low_use1 == low_use3); // same HW queues, same ref count + + EXPECT_EQ(norm_use1, 2); + EXPECT_EQ(norm_use2, 1); + EXPECT_TRUE(norm_use1 == norm_use3); + + EXPECT_EQ(high_use1, 2); + EXPECT_EQ(high_use2, 1); + EXPECT_TRUE(high_use1 == high_use3); + + // Release all queues + ASSERT_SUCCESS(hsa_amd_counted_queue_release(low1)); + ASSERT_SUCCESS(hsa_amd_counted_queue_release(low2)); + ASSERT_SUCCESS(hsa_amd_counted_queue_release(low3)); + + ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal1)); + ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal2)); + ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal3)); + + ASSERT_SUCCESS(hsa_amd_counted_queue_release(high1)); + ASSERT_SUCCESS(hsa_amd_counted_queue_release(high2)); + ASSERT_SUCCESS(hsa_amd_counted_queue_release(high3)); +} + +void CountedQueuesTest::CountedQueuesSetPriorityNackTest() { + hsa_status_t status; + + // Find all gpu agents + std::vector gpus; + ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus)); + + // Create a counted queue + hsa_queue_t* queue = nullptr; + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue)); + EXPECT_NE(queue, nullptr); + + // Try to set priority on this queue; should fail + status = hsa_amd_queue_set_priority(queue, HSA_AMD_QUEUE_PRIORITY_HIGH); + EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE); + + // release queue + ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue)); +} + +void CountedQueuesTest::CountedQueuesSetCUMaskNackTest() { + hsa_status_t status; + + // Find all gpu agents + std::vector gpus; + ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus)); + + // Create a counted queue + hsa_queue_t* queue = nullptr; + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire( + gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue)); + EXPECT_NE(queue, nullptr); + + // Attempt to set CU mask on counted queue; should fail + uint32_t cu_mask[32] = {0}; // dummy mask + status = hsa_amd_queue_cu_set_mask(queue, 1, cu_mask); + EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE); + + // release queue + ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue)); +} + +void CountedQueuesTest::CountedQueuesDispatchTest() { + hsa_status_t status; + + // Common setup + ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this)); + ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this)); + + // Load kernel + set_kernel_file_name("test_case_template_kernels.hsaco"); + set_kernel_name("square"); + ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1())); + + hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()}; + + // Allocate source buffer + void* src_buffer = nullptr; + ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &src_buffer)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, src_buffer)); + + // Initialize source data + for (uint32_t i = 0; i < 256; ++i) { + reinterpret_cast(src_buffer)[i] = i; + } + + // Allocate destination buffer + void* dst_buffer = nullptr; + ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &dst_buffer)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, dst_buffer)); + + // Create completion signal + hsa_signal_t completion_signal; + ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &completion_signal)); + + // Get a counted queue + hsa_queue_t* queue = nullptr; + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI, + HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, + &queue)); + EXPECT_NE(queue, nullptr); + + // Query queue info + int32_t use_count = 0; + uint32_t hw_id = 0; + ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &hw_id)); + EXPECT_EQ(use_count, 1); + + // Prepare kernel arguments + struct __attribute__((aligned(16))) local_args_t { + uint32_t* dstArray; + uint32_t* srcArray; + uint32_t size; + uint32_t pad; + uint64_t global_offset_x; + uint64_t global_offset_y; + uint64_t global_offset_z; + uint64_t printf_buffer; + uint64_t default_queue; + uint64_t completion_action; + } local_args; + + local_args.dstArray = reinterpret_cast(dst_buffer); + local_args.srcArray = reinterpret_cast(src_buffer); + local_args.size = 256; + local_args.global_offset_x = 0; + local_args.global_offset_y = 0; + local_args.global_offset_z = 0; + local_args.printf_buffer = 0; + local_args.default_queue = 0; + local_args.completion_action = 0; + + // Allocate kernel arguments + void* kernarg_address = nullptr; + ASSERT_SUCCESS( + hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address)); + memcpy(kernarg_address, &local_args, sizeof(local_args)); + + // Dispatch loop + int it = num_iteration() * 5; + const uint32_t queue_mask = queue->size - 1; + + for (int i = 0; i < it; i++) { + // Reserve a slot in the queue + uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1); + + // Get pointer to the reserved packet slot + hsa_kernel_dispatch_packet_t* queue_aql_packet = + &(reinterpret_cast(queue->base_address))[index & queue_mask]; + + // Fill packet fields + queue_aql_packet->setup = 1; + queue_aql_packet->workgroup_size_x = 256; + queue_aql_packet->workgroup_size_y = 1; + queue_aql_packet->workgroup_size_z = 1; + queue_aql_packet->grid_size_x = 256; + queue_aql_packet->grid_size_y = 1; + queue_aql_packet->grid_size_z = 1; + queue_aql_packet->private_segment_size = 0; + queue_aql_packet->group_segment_size = 0; + queue_aql_packet->kernel_object = kernel_object(); + queue_aql_packet->kernarg_address = kernarg_address; + queue_aql_packet->completion_signal = completion_signal; + + // Write header for packet + uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + __atomic_store_n(reinterpret_cast(&queue_aql_packet->header), header, + __ATOMIC_RELEASE); + + // Ring doorbell to notify GPU + hsa_signal_store_screlease(queue->doorbell_signal, index); + + // Wait for completion signal + while (hsa_signal_wait_scacquire(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_ACTIVE)) { + } + + // Reset signal for next iteration + hsa_signal_store_screlease(completion_signal, 1); + + // Verify results + ASSERT_TRUE(VerifyResult(reinterpret_cast(dst_buffer), 256)); + } + + // Verify use count before release + ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count)); + EXPECT_EQ(use_count, 1); + + // Release the counted queue + ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue)); + + // Verify queue info returns error after release + status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count); + ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT); + + // Cleanup + ASSERT_SUCCESS(hsa_amd_memory_pool_free(kernarg_address)); + ASSERT_SUCCESS(hsa_signal_destroy(completion_signal)); + ASSERT_SUCCESS(hsa_amd_memory_pool_free(src_buffer)); + ASSERT_SUCCESS(hsa_amd_memory_pool_free(dst_buffer)); +} + +void CountedQueuesTest::CountedQueuesMultithreadedDispatchTest() { + hsa_status_t status; + + // Common setup + ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this)); + ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this)); + + // Load kernel + set_kernel_file_name("test_case_template_kernels.hsaco"); + set_kernel_name("square"); + ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1())); + + hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()}; + + // Shared source buffer (read-only) + void* shared_src_buffer = nullptr; + ASSERT_SUCCESS( + hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer)); + + // Initialize source data + for (uint32_t i = 0; i < 256; ++i) { + reinterpret_cast(shared_src_buffer)[i] = i; + } + + // Structures for validation later on + std::mutex hwIdsMutex; + std::vector allHwIds; + std::atomic maxUseCount{0}; + + auto func = [&]() { + // local dest buffer for each user application + void* local_dst_buffer = nullptr; + ASSERT_SUCCESS( + hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer)); + + // Local completion signal for every user application + hsa_signal_t local_signal; + ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal)); + + // Get a counted queue + hsa_queue_t* queue = nullptr; + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI, + HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, + &queue)); + EXPECT_NE(queue, nullptr); + + if (queue == nullptr) { + hsa_signal_destroy(local_signal); + hsa_amd_memory_pool_free(local_dst_buffer); + return; + } + + // Store query results for later analysis + int32_t localUseCount = 0; + uint32_t localHwId = 0; + + ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &localUseCount)); + ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &localHwId)); + + // Update use_count if it is larger than previous value + int expected = maxUseCount.load(); + while (localUseCount > expected && + !maxUseCount.compare_exchange_weak(expected, localUseCount)) { + } + + // Store hw id for validation later on + { + std::lock_guard lock(hwIdsMutex); + allHwIds.push_back(localHwId); + } + + struct __attribute__((aligned(16))) local_args_t { + uint32_t* dstArray; + uint32_t* srcArray; + uint32_t size; + uint32_t pad; + uint64_t global_offset_x; + uint64_t global_offset_y; + uint64_t global_offset_z; + uint64_t printf_buffer; + uint64_t default_queue; + uint64_t completion_action; + } local_args; + + local_args.dstArray = reinterpret_cast(local_dst_buffer); + local_args.srcArray = reinterpret_cast(shared_src_buffer); + local_args.size = 256; + local_args.global_offset_x = 0; + local_args.global_offset_y = 0; + local_args.global_offset_z = 0; + local_args.printf_buffer = 0; + local_args.default_queue = 0; + local_args.completion_action = 0; + + void* kernarg_address = nullptr; + ASSERT_SUCCESS( + hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address)); + + memcpy(kernarg_address, &local_args, sizeof(local_args)); + + // Dispatch loop + int it = num_iteration() * 5; + const uint32_t queue_mask = queue->size - 1; + + for (int i = 0; i < it; i++) { + // Reserve a slot in the queue + uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1); + + // Get pointer to the reserved packet slot and validate address + hsa_kernel_dispatch_packet_t* queue_aql_packet = &( + reinterpret_cast(queue->base_address))[index & queue_mask]; + ASSERT_EQ(queue_aql_packet, + reinterpret_cast(queue->base_address) + (index & queue_mask)); + + // Fill packet fields + queue_aql_packet->setup = 1; + queue_aql_packet->workgroup_size_x = 256; + queue_aql_packet->workgroup_size_y = 1; + queue_aql_packet->workgroup_size_z = 1; + queue_aql_packet->grid_size_x = 256; + queue_aql_packet->grid_size_y = 1; + queue_aql_packet->grid_size_z = 1; + queue_aql_packet->private_segment_size = 0; + queue_aql_packet->group_segment_size = 0; + queue_aql_packet->kernel_object = kernel_object(); + queue_aql_packet->kernarg_address = kernarg_address; + queue_aql_packet->completion_signal = local_signal; + + // Write header for packet + uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + __atomic_store_n(reinterpret_cast(&queue_aql_packet->header), header, + __ATOMIC_RELEASE); + + // Ring doorbell to notify GPU + hsa_signal_store_screlease(queue->doorbell_signal, index); + + // Wait for completion signal to be less than 1 + while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_ACTIVE)) { + } + + // Reset signal for next iteration + hsa_signal_store_screlease(local_signal, 1); + + ASSERT_TRUE(VerifyResult(reinterpret_cast(local_dst_buffer), 256)); + } + + // Cleanup + hsa_amd_memory_pool_free(kernarg_address); + hsa_signal_destroy(local_signal); + hsa_amd_memory_pool_free(local_dst_buffer); + + // Release the counted queue + ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue)); + }; + + constexpr int kThreads = 2; + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(func); + } + + for (auto& th : threads) { + th.join(); + } + + // With GPU_MAX_HW_QUEUES=1, all threads should share the same HW queue + // Check if largest useCount is same as the number of user apps accessing queues + EXPECT_EQ(maxUseCount.load(), kThreads); + + // All HW IDs should be the same (only 1 HW queue created) + EXPECT_EQ(allHwIds.size(), static_cast(kThreads)); + for (size_t i = 1; i < allHwIds.size(); i++) { + EXPECT_EQ(allHwIds[i], allHwIds[0]); + } + + hsa_amd_memory_pool_free(shared_src_buffer); +} + +void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() { + hsa_status_t status; + + // Common setup + ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this)); + ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this)); + + // Load kernel + set_kernel_file_name("test_case_template_kernels.hsaco"); + set_kernel_name("square"); + ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1())); + + hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()}; + + void* shared_src_buffer = nullptr; + ASSERT_SUCCESS( + hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer)); + + for (uint32_t i = 0; i < 256; ++i) { + reinterpret_cast(shared_src_buffer)[i] = i; + } + + // To verify that after the queue has been used up, next index wraps around + std::atomic maxIndexSeen{0}; + + auto func = [&]() { + // local dest buffer for each user application + void* local_dst_buffer = nullptr; + ASSERT_SUCCESS( + hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer)); + + // Local completion signal for every user application + hsa_signal_t local_signal; + ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal)); + + // Get a counted queue + hsa_queue_t* queue = nullptr; + ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI, + HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, + &queue)); + EXPECT_NE(queue, nullptr); + + if (queue == nullptr) { + hsa_signal_destroy(local_signal); + hsa_amd_memory_pool_free(local_dst_buffer); + return; + } + + uint32_t queue_size = queue->size; // should be 16384 + const uint32_t queue_mask = queue_size - 1; // used for index wraparound + + struct __attribute__((aligned(16))) local_args_t { + uint32_t* dstArray; + uint32_t* srcArray; + uint32_t size; + uint32_t pad; + uint64_t global_offset_x; + uint64_t global_offset_y; + uint64_t global_offset_z; + uint64_t printf_buffer; + uint64_t default_queue; + uint64_t completion_action; + } local_args; + + local_args.dstArray = reinterpret_cast(local_dst_buffer); + local_args.srcArray = reinterpret_cast(shared_src_buffer); + local_args.size = 256; + local_args.global_offset_x = 0; + local_args.global_offset_y = 0; + local_args.global_offset_z = 0; + local_args.printf_buffer = 0; + local_args.default_queue = 0; + local_args.completion_action = 0; + + void* kernarg_address = nullptr; + ASSERT_SUCCESS( + hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address)); + ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address)); + + memcpy(kernarg_address, &local_args, sizeof(local_args)); + + // Dispatch more packets than queue size to force overflow and ensure that indices wrap around + int it = queue_size + 5; + + for (int i = 0; i < it; i++) { + // Reserve a slot in the queue + uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1); + + uint64_t curr_max = maxIndexSeen.load(); + while (index > curr_max && !maxIndexSeen.compare_exchange_weak(curr_max, index)) { + } + + // Get pointer to the reserved packet slot using wraparound masking + uint64_t wrapped_index = index & queue_mask; + hsa_kernel_dispatch_packet_t* queue_aql_packet = + &(reinterpret_cast(queue->base_address))[wrapped_index]; + + // Fill packet fields + queue_aql_packet->setup = 1; + queue_aql_packet->workgroup_size_x = 256; + queue_aql_packet->workgroup_size_y = 1; + queue_aql_packet->workgroup_size_z = 1; + queue_aql_packet->grid_size_x = 256; + queue_aql_packet->grid_size_y = 1; + queue_aql_packet->grid_size_z = 1; + queue_aql_packet->private_segment_size = 0; + queue_aql_packet->group_segment_size = 0; + queue_aql_packet->kernel_object = kernel_object(); + queue_aql_packet->kernarg_address = kernarg_address; + queue_aql_packet->completion_signal = local_signal; + + // Write header for packet + uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + __atomic_store_n(reinterpret_cast(&queue_aql_packet->header), header, + __ATOMIC_RELEASE); + + // Ring doorbell to notify GPU + hsa_signal_store_screlease(queue->doorbell_signal, index); + + // Wait for completion signal to be less than 1 + while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_ACTIVE)) { + } + + // Reset signal for next iteration + hsa_signal_store_screlease(local_signal, 1); + + // Verify results are still correct after wraparound + ASSERT_TRUE(VerifyResult(reinterpret_cast(local_dst_buffer), 256)); + } + + // Cleanup + hsa_amd_memory_pool_free(kernarg_address); + hsa_signal_destroy(local_signal); + hsa_amd_memory_pool_free(local_dst_buffer); + + // Release the counted queue + ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue)); + }; + + constexpr int kThreads = 2; + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(func); + } + + for (auto& th : threads) { + th.join(); + } + + // Verify value of max seen index + uint64_t maxId = maxIndexSeen.load(); + EXPECT_EQ(maxId, (16384 + 5) * kThreads - 1); + + hsa_amd_memory_pool_free(shared_src_buffer); +} \ No newline at end of file diff --git a/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.h b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.h new file mode 100644 index 0000000000..6279343a2c --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.h @@ -0,0 +1,68 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H +#define ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H + + +#include "suites/test_common/test_base.h" + +class CountedQueuesTest : public TestBase { + public: + explicit CountedQueuesTest(); + + // @Brief: Destructor for test case of CountedQueuesTest + virtual ~CountedQueuesTest(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrive the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); + + /// @brief Basic API test to acquire, query and release 1 HW queue + void CountedQueueBasicApiTest(); + + /// @brief This test verifies that when many queues of the same priority are created, + // they are evenly distributed across the limited set of hardware queues, reuse those + // hardware queues correctly, and report proper use-counts before and after release + void CountedQueues_SamePriority_MaxLimitTest(); + + // @brief Test to verify HSA status codes for incorrect arguments sent via API + void InvalidArgsTest(); + + // @brief Test to verify that counted queues across all priorities each reuse only + // their own priority-specific hardware queues + void CountedQueuesAllPrioritiesLimitTest(); + + /// @brief Test to verify hsa_amd_queue_set_priority() does not work on counted queues + void CountedQueuesSetPriorityNackTest(); + + /// @brief Test to verify hsa_amd_queue_cu_set_mask() does not work on counted queues + void CountedQueuesSetCUMaskNackTest(); + + /// @brief Test to verify that a counted queue correctly supports kernel dispatches end-to-end + void CountedQueuesDispatchTest(); + + /// @brief Test to verify kernel dispatches onto shared queues from multiple user apps even when they + // all share the same HW queue + void CountedQueuesMultithreadedDispatchTest(); + + /// @brief Test to verify ring buffer wrap around when more than queue_size number of + // AQL packets are enqueued + void CountedQueuesOverflowWrapAroundTest(); +}; + +#endif // ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H \ No newline at end of file diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc index e0020be00c..2ae0df15ef 100644 --- a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc +++ b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc @@ -82,6 +82,7 @@ #include "suites/functional/filter_devices.h" #include "amd_smi/amdsmi.h" #include "common/common.h" +#include "suites/functional/counted_queues.h" #include "common/os.h" static RocrTstGlobals *sRocrtstGlvalues = nullptr; @@ -507,6 +508,69 @@ TEST(rocrtstFunc, Filter_Devices_Test) { ); } +TEST(rocrtstFunc, Counted_Queue_Basic_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueueBasicApiTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Same_Priority_Max_Limit_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueues_SamePriority_MaxLimitTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Invalid_Args_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.InvalidArgsTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Multiple_Priorities_Limit_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueuesAllPrioritiesLimitTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Set_Priority_Nack_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueuesSetPriorityNackTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Set_CUMask_Nack_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueuesSetCUMaskNackTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Dispatch_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueuesDispatchTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Multithreaded_Dispatch_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueuesMultithreadedDispatchTest(); + RunCustomTestEpilog(&cq); +} + +TEST(rocrtstFunc, Counted_Queue_Overflow_And_Wraparound_Test) { + CountedQueuesTest cq; + RunCustomTestProlog(&cq); + cq.CountedQueuesOverflowWrapAroundTest(); + RunCustomTestEpilog(&cq); +} + TEST(rocrtstNeg, Memory_Negative_Tests) { RUN_IF_NOT_EMU_MODE( MemoryAllocateNegativeTest mt; diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/main.h b/projects/rocr-runtime/rocrtst/suites/test_common/main.h index 67b961c57d..92897efc3c 100755 --- a/projects/rocr-runtime/rocrtst/suites/test_common/main.h +++ b/projects/rocr-runtime/rocrtst/suites/test_common/main.h @@ -54,6 +54,5 @@ test; \ } - #endif // ROCRTST_SUITES_TEST_COMMON_MAIN_H_ diff --git a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt index 6f0d074c43..b8aac0e5bf 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt +++ b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt @@ -217,6 +217,7 @@ set ( SRCS core/driver/driver.cpp core/runtime/cache.cpp core/runtime/svm_profiler.cpp core/runtime/thunk_loader.cpp + core/runtime/counted_queue_manager.cpp core/common/hsa_table_interface.cpp loader/executable.cpp libamdhsacode/amd_elf_image.cpp diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index b8580ca955..f367b668fa 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -1345,6 +1345,18 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi size_copied, status); } +hsa_status_t HSA_API hsa_amd_counted_queue_acquire( + hsa_agent_t agent, hsa_queue_type_t type, hsa_amd_queue_priority_t priority, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, + uint64_t flags, hsa_queue_t** queue) { + return amdExtTable->hsa_amd_counted_queue_acquire_fn(agent, type, priority, callback, data, flags, + queue); +} + +hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue) { + return amdExtTable->hsa_amd_counted_queue_release_fn(queue); +} + // Tools only table interfaces. namespace rocr { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 7b858961fa..915cbcf40e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -64,6 +64,28 @@ extern r_debug _amdgpu_r_debug; namespace rocr { + +/// @brief Mapping between priority type used internally within ROCR to the type used by KFD + +// Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH +// HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally +// in ROCR +__forceinline HSA_QUEUE_PRIORITY HsaInternalToKfdPriority( + rocr::HSA::hsa_amd_queue_priority_internal_t priority) { + switch (priority) { + case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_LOW: + return HSA_QUEUE_PRIORITY_MINIMUM; + case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL: + return HSA_QUEUE_PRIORITY_NORMAL; + case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_HIGH: + return HSA_QUEUE_PRIORITY_HIGH; + case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM: + return HSA_QUEUE_PRIORITY_MAXIMUM; + default: + return HSA_QUEUE_PRIORITY_NORMAL; + } +} + namespace AMD { #if defined(__linux__) @@ -370,10 +392,13 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) { } hsa_status_t KfdDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, + HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, HsaQueueResource& queue_resource) const { - if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id, + // Convert from ROCR internal priority type to KFD type + HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority); + + if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, kfd_priority, sdma_engine_id, queue_addr, queue_size_bytes, event, &queue_resource)) != HSAKMT_STATUS_SUCCESS) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; @@ -389,9 +414,12 @@ hsa_status_t KfdDriver::DestroyQueue(HSA_QUEUEID queue_id) const { } hsa_status_t KfdDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, void* queue_addr, + HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr, uint64_t queue_size, HsaEvent* event) const { - if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, priority, queue_addr, queue_size, + // Convert from ROCR internal priority type to KFD type + HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority); + + if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, kfd_priority, queue_addr, queue_size, event)) != HSAKMT_STATUS_SUCCESS) { return HSA_STATUS_ERROR; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/virtio/amd_kfd_virtio_driver.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/virtio/amd_kfd_virtio_driver.cpp index efd32709ca..4289a9b164 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/virtio/amd_kfd_virtio_driver.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/virtio/amd_kfd_virtio_driver.cpp @@ -430,7 +430,7 @@ hsa_status_t KfdVirtioDriver::MakeMemoryUnresident(const void* mem) const { } hsa_status_t KfdVirtioDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, + HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, HsaQueueResource& queue_resource) const { if (vhsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id, queue_addr, @@ -447,7 +447,7 @@ hsa_status_t KfdVirtioDriver::DestroyQueue(HSA_QUEUEID queue_id) const { } hsa_status_t KfdVirtioDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage, - HSA_QUEUE_PRIORITY priority, void* queue_mem, + HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem, uint64_t queue_size, HsaEvent* event) const { return HSA_STATUS_ERROR; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index a1ca73c32a..c6195f10b5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -383,7 +383,7 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) { } hsa_status_t XdnaDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, + HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, HsaQueueResource& queue_resource) const { queue_resource.QueueId = AMDXDNA_INVALID_CTX_HANDLE; @@ -407,7 +407,7 @@ hsa_status_t XdnaDriver::DestroyQueue(HSA_QUEUEID queue_id) const { } hsa_status_t XdnaDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, void* queue_addr, + HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr, uint64_t queue_size, HsaEvent* event) const { // AIE doesn't support queue updates. return HSA_STATUS_ERROR_INVALID_QUEUE; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index d007d00c2c..e00867e07f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -73,7 +73,7 @@ class AieAqlQueue : public core::Queue, ~AieAqlQueue(); hsa_status_t Inactivate() override; - hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override; + hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override; void Destroy() override; uint64_t LoadReadIndexRelaxed() override; uint64_t LoadReadIndexAcquire() override; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h index ef80f69776..de7a43be28 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -73,7 +73,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo hsa_status_t Inactivate() override; /// @brief Change the scheduling priority of the queue - hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override; + hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override; /// @brief Destroy ref counted queue void Destroy() override; @@ -316,7 +316,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo bool suspended_; // Thunk dispatch and wavefront scheduling priority - HSA_QUEUE_PRIORITY priority_; + HSA::hsa_amd_queue_priority_internal_t priority_; // Exception notification signal Signal* exception_signal_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index c799cd8611..df5e4cf21a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -62,6 +62,7 @@ #include "core/util/locks.h" #include "core/util/small_heap.h" #include "pcs/pcs_runtime.h" +#include "core/inc/counted_queue_manager.h" namespace rocr { namespace AMD { @@ -342,6 +343,16 @@ class GpuAgent : public GpuAgentInt { void AcquireQueueAltScratch(ScratchInfo& scratch) override; void ReleaseQueueAltScratch(ScratchInfo& scratch) override; + // @brief Create a pool of shared queues for multiple user applications within a max limit + hsa_status_t AcquireCountedQueue(hsa_queue_type_t type, + HSA::hsa_amd_queue_priority_internal_t priority, + void (*callback)(hsa_status_t, hsa_queue_t*, void*), + void* data, uint64_t flags, + hsa_queue_t** out_queue); + + // @brief Release a queue earlier used by application + hsa_status_t ReleaseCountedQueue(hsa_queue_t* queue); + // @brief Override from AMD::GpuAgentInt. void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) override; @@ -645,6 +656,9 @@ class GpuAgent : public GpuAgentInt { // @brief HSA profile. hsa_profile_t profile_; + // @brief Pool of shared queues owned by this agent + rocr::core::CountedQueuePoolManager queue_pool_; + void* trap_code_buf_; size_t trap_code_buf_size_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index d811ff8365..31327ce7dc 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -97,10 +97,10 @@ public: uint32_t node_id) override; hsa_status_t FreeMemory(void *mem, size_t size) override; hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr, + HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, HsaQueueResource& queue_resource) const override; - hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority, + hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr, uint64_t queue_size, HsaEvent* event) const override; hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override; hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count, diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_virtio_driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_virtio_driver.h index fd229e94f6..994f3a28c8 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_virtio_driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_virtio_driver.h @@ -90,12 +90,12 @@ class KfdVirtioDriver final : public core::Driver { const uint32_t* nodes) const override; hsa_status_t MakeMemoryUnresident(const void* mem) const override; hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr, + HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, HsaQueueResource& queue_resource) const override; hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override; hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage, - HSA_QUEUE_PRIORITY priority, void* queue_mem, uint64_t queue_size, + HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem, uint64_t queue_size, HsaEvent* event) const override; hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t num_cu_mask, uint32_t* cu_mask) const override; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 46b3dd2ea0..f730a8d13f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -204,10 +204,10 @@ public: uint32_t node_id) override; hsa_status_t FreeMemory(void *mem, size_t size) override; hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr, + HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, HsaQueueResource& queue_resource) const override; - hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority, + hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr, uint64_t queue_size, HsaEvent* event) const override; hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override; hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count, diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h new file mode 100644 index 0000000000..e7b05dc318 --- /dev/null +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h @@ -0,0 +1,68 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_ +#define HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_ + + +#include "hsa.h" +#include "hsa_ext_amd.h" +#include "core/inc/agent.h" +#include "core/inc/runtime.h" +#include +#include +#include +#include + +namespace rocr { +namespace core { + +// Wrapper for a logical counted queue (unique handle + callback) +struct CountedQueue { + core::Queue* hw_queue; // this will store the public handle of HW Queue (hsa_queue_t) + void (*callback)(hsa_status_t, hsa_queue_t*, void*); + void* callback_data; + + CountedQueue(core::Queue* hw, void (*cb)(hsa_status_t, hsa_queue_t*, void*), void* data) + : hw_queue(hw), callback(cb), callback_data(data) {} +}; + +// Manages the pool of counted queues for a single GPU agent +class CountedQueuePoolManager { + public: + explicit CountedQueuePoolManager(core::Agent*); + + // Acquire a queue (either reuse or create new) + hsa_status_t AcquireQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority, + void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, + uint64_t flags, hsa_queue_t** out_queue); + + // Release a logical queue + hsa_status_t ReleaseQueue(hsa_queue_t* queue); + + // Called during hsa_shutdown to remove all user and CP queues + void Cleanup(); + + private: + core::Queue* FindOrCreateHardwareQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority, + void (*callback)(hsa_status_t, hsa_queue_t*, void*), + void* data, uint64_t flags); + + core::Agent* agent_; // pointer to the gpu agent that owns this pool + uint32_t max_hw_queues_; + std::mutex mutex_; + + // Pool of hw queues by priority on the agent + std::map> hw_queue_pools_; + + // Map from unique handle to CountedQueue (hw queue, metadata per acquire request) + std::map> counted_queues_; +}; + +} // namespace core +} // namespace rocr + +#endif // HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_ diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h index d174c796f1..4f6297e214 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h @@ -50,6 +50,7 @@ #include "core/inc/memory_region.h" #include "hsakmt/hsakmttypes.h" #include "inc/hsa.h" +#include "core/inc/hsa_internal.h" namespace rocr { namespace core { @@ -159,7 +160,7 @@ public: /// @param[in] event HsaEvent for event-driven callbacks. /// @param[out] queue_resource Queue resource information populated by the driver. virtual hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, + HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, HsaQueueResource& queue_resource) const = 0; @@ -175,7 +176,7 @@ public: /// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes. /// @param[in] event HsaEvent for event-driven callbacks. virtual hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, - HSA_QUEUE_PRIORITY priority, void* queue_addr, + HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event) const = 0; /// @brief Set the CU mask for a queue. diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h index 5464c53a91..b6b219b446 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/host_queue.h @@ -60,7 +60,7 @@ class HostQueue : public Queue { ~HostQueue(); hsa_status_t Inactivate() override { return HSA_STATUS_SUCCESS; } - hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override { + hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override { return HSA_STATUS_ERROR_INVALID_QUEUE; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index 68288f2d0a..a128fb54ff 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -381,6 +381,15 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi uint64_t size, int64_t file_offset, uint64_t *size_copied, int32_t *status); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type, + hsa_amd_queue_priority_t priority, + void (*callback)(hsa_status_t status, + hsa_queue_t* source, void* data), + void* data, uint64_t flags, hsa_queue_t** queue); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue); + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_enable_logging(uint8_t* flags, void* file); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_internal.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_internal.h index 74069e08b5..e29260d62d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_internal.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_internal.h @@ -44,10 +44,18 @@ #define HSA_RUNTIME_CORE_INC_HSA_INTERNAL_H #include "inc/hsa.h" +#include "inc/hsa_ext_amd.h" namespace rocr { namespace HSA { + typedef enum hsa_amd_queue_priority_internal_s { + HSA_AMD_QUEUE_PRIORITY_LOW = HSA_AMD_QUEUE_PRIORITY_LOW, + HSA_AMD_QUEUE_PRIORITY_NORMAL = HSA_AMD_QUEUE_PRIORITY_NORMAL, + HSA_AMD_QUEUE_PRIORITY_HIGH = HSA_AMD_QUEUE_PRIORITY_HIGH, + HSA_AMD_QUEUE_PRIORITY_MAXIMUM = HSA_AMD_QUEUE_PRIORITY_HIGH + 1, + } hsa_amd_queue_priority_internal_t; + // Define core namespace interfaces - copy of function declarations in hsa.h hsa_status_t hsa_init(); hsa_status_t hsa_shut_down(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h index f7e1d18fec..c8849a26b3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h @@ -66,7 +66,7 @@ class QueueWrapper : public Queue { explicit QueueWrapper(std::unique_ptr queue) : Queue(static_cast(core::Runtime::runtime_singleton_->system_allocator()( sizeof(core::SharedQueue), 4096, 0, 0)), - 0), + 0, nullptr), wrapped(std::move(queue)) { memcpy(&amd_queue_, &wrapped->amd_queue_, sizeof(amd_queue_)); wrapped->set_public_handle(wrapped.get(), public_handle_); @@ -77,7 +77,7 @@ class QueueWrapper : public Queue { } hsa_status_t Inactivate() override { return wrapped->Inactivate(); } - hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override { + hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override { return wrapped->SetPriority(priority); } uint64_t LoadReadIndexAcquire() override { return wrapped->LoadReadIndexAcquire(); } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h index 2a3c551be1..ffc5fb3895 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h @@ -168,12 +168,15 @@ All funtions other than Convert and public_handle must be virtual. */ class Queue : public Checked<0xFA3906A679F9DB49> { public: - Queue(SharedQueue* shared_queue, uint64_t queue_flags) - : Queue(shared_queue, queue_flags, false) {} + Queue(SharedQueue* shared_queue, uint64_t queue_flags, core::Agent* agent) + : Queue(shared_queue, queue_flags, false, agent) {} - Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering) + Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering, core::Agent* agent) : amd_queue_(shared_queue->amd_queue), + use_count(0), + is_counted_queue(false), shared_queue_(shared_queue), + agent_(agent), flags_(queue_flags), pcie_write_ordering_(pcie_write_ordering) { public_handle_ = Convert(this); @@ -213,7 +216,7 @@ class Queue : public Checked<0xFA3906A679F9DB49> { virtual hsa_status_t Inactivate() = 0; /// @brief Change the scheduling priority of the queue - virtual hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) = 0; + virtual hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) = 0; /// @brief Reads the Read Index of Queue using Acquire semantics /// @@ -372,6 +375,13 @@ class Queue : public Checked<0xFA3906A679F9DB49> { hsa_queue_t* public_handle() const { return public_handle_; } + // Get a pointer to the agent that owns this queue + core::Agent* GetAgent() { return agent_; } + + // @brief Attributes specifically for counted queue types + uint32_t use_count; + bool is_counted_queue; + typedef void* rtti_t; bool IsType(rtti_t id) { return _IsA(id); } @@ -401,6 +411,8 @@ class Queue : public Checked<0xFA3906A679F9DB49> { SharedQueue* shared_queue_; + core::Agent* agent_; // pointer to the agent that owns this queue + hsa_queue_t* public_handle_; /// Next available queue id. diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 4b22f5419e..207f83950a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -67,7 +67,7 @@ namespace AMD { AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_t req_size_pkts, uint32_t node_id, uint64_t flags) - : Queue(shared_queue, flags), + : Queue(shared_queue, flags, agent), LocalSignal(0, false), DoorbellSignal(signal()), agent_(*agent), @@ -104,7 +104,7 @@ AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_ HsaQueueResource queue_resource = {}; hsa_status_t status = - agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, HSA_QUEUE_PRIORITY_NORMAL, 0, + agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL, 0, nullptr, queue_size_bytes_, nullptr, queue_resource); if (status != HSA_STATUS_SUCCESS) { throw AMD::hsa_exception(status, "Failed to create a hardware context for an AIE queue."); @@ -135,7 +135,7 @@ hsa_status_t AieAqlQueue::Inactivate() { return status; } -hsa_status_t AieAqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) { +hsa_status_t AieAqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) { return HSA_STATUS_SUCCESS; } @@ -277,6 +277,13 @@ hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, // Hardware doorbell supports AQL semantics. *static_cast(value) = reinterpret_cast(signal_.hardware_doorbell_ptr); break; + case HSA_QUEUE_INFO_USE_COUNT: + // AIE queues do not support counted queue features + *static_cast(value) = static_cast(-1); + break; + case HSA_QUEUE_INFO_HW_ID: + *static_cast(value) = public_handle()->id; + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index ff05e8ecc1..ccc20b8d19 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -81,7 +81,7 @@ namespace AMD { AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch, core::HsaEventCallback callback, void* err_data, uint64_t flags) - : Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu()), + : Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu(), agent), LocalSignal(0, false), DoorbellSignal(signal()), ring_buf_(nullptr), @@ -97,7 +97,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_ dynamicScratchState(0), exceptionState(0), suspended_(false), - priority_(HSA_QUEUE_PRIORITY_NORMAL), + priority_(HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL), exception_signal_(nullptr) { // Queue size is a function of several restrictions. @@ -495,6 +495,21 @@ hsa_status_t AqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value *(reinterpret_cast(value)) = reinterpret_cast(signal_.hardware_doorbell_ptr); break; + case HSA_QUEUE_INFO_USE_COUNT: + if (!is_counted_queue) { + *static_cast(value) = static_cast(-1); + } else { + if (use_count == 0) { + // Queue was released + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + *static_cast(value) = use_count; + } + break; + case HSA_QUEUE_INFO_HW_ID: + // Return the hardware queue ID for both counted and non-counted queues + *static_cast(value) = public_handle()->id; + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -631,7 +646,7 @@ hsa_status_t AqlQueue::Inactivate() { return HSA_STATUS_SUCCESS; } -hsa_status_t AqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) { +hsa_status_t AqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) { if (suspended_) { return HSA_STATUS_ERROR_INVALID_QUEUE; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index d1681f968a..18c024a246 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -184,7 +184,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent, bool use_xgm // boolean flag const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID : (use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA); - if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM, + if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM, rec_eng, queue_start_addr_, kQueueSize, nullptr, queue_resource_) != HSA_STATUS_SUCCESS) { LogPrint(HSA_AMD_LOG_FLAG_INFO, "Failed to create queue, size=%d, type=%d," diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 01b01fe869..4f2019c4ef 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -99,6 +99,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), scratch_used_large_(0), queues_(), + queue_pool_(this), trap_code_buf_(NULL), trap_code_buf_size_(0), doorbell_queue_map_(NULL), @@ -774,13 +775,13 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { void GpuAgent::InitDma() { // Setup lazy init pointers on queues and blits. - auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) { + auto queue_lambda = [this](HSA::hsa_amd_queue_priority_internal_t priority = HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL) { auto queue = CreateInterceptibleQueue(); if (queue == nullptr) throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Internal queue creation failed."); - if (priority != HSA_QUEUE_PRIORITY_NORMAL) + if (priority != HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL) if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to increase queue priority for PC Sampling"); @@ -795,7 +796,7 @@ void GpuAgent::InitDma() { // Dedicated compute queue for PC Sampling CP-DMA commands. We need a dedicated queue that runs at // highest priority because we do not want the CP-DMA commands to be delayed/blocked due to // other dispatches/barriers that could be in the other AQL queues. - queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); }); + queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM); }); // Decide which engine to use for blits. auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue, bool isHostToDev, uint32_t rec_eng) { @@ -924,6 +925,10 @@ void GpuAgent::PreloadBlits() { void GpuAgent::ReleaseResources() { if (this->Enabled()) { this->Disable(); + + // Remove all shared hardware queues from pool + queue_pool_.Cleanup(); + for (auto& blit : blits_) { if (!blit.empty()) { hsa_status_t status = blit->Destroy(); @@ -1829,6 +1834,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u auto aql_queue = new AqlQueue(shared_queue, this, size, node_id(), scratch, event_callback, data, flags); + *queue = aql_queue; aql_queues_.push_back(aql_queue); @@ -3519,5 +3525,17 @@ hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& sessi return HSA_STATUS_SUCCESS; } +hsa_status_t GpuAgent::AcquireCountedQueue(hsa_queue_type_t type, + HSA::hsa_amd_queue_priority_internal_t priority, + void (*callback)(hsa_status_t, hsa_queue_t*, void*), + void* data, uint64_t flags, + hsa_queue_t** out_queue) { + return queue_pool_.AcquireQueue(type, priority, callback, data, flags, out_queue); +} + +hsa_status_t GpuAgent::ReleaseCountedQueue(hsa_queue_t* queue) { + return queue_pool_.ReleaseQueue(queue); +} + } // namespace amd } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp new file mode 100644 index 0000000000..675129e776 --- /dev/null +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp @@ -0,0 +1,145 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#include "core/inc/counted_queue_manager.h" +#include "core/inc/agent.h" +#include "core/inc/runtime.h" + +namespace rocr { +namespace core { + +constexpr size_t DEFAULT_QUEUE_SIZE = 16384; + +CountedQueuePoolManager::CountedQueuePoolManager(core::Agent* agent) : agent_(agent) { + // Read in GPU_MAX_HW_QUEUES flag value + max_hw_queues_ = core::Runtime::runtime_singleton_->flag().cp_queues_limit(); +} + +hsa_status_t CountedQueuePoolManager::AcquireQueue( + hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority, + void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags, + hsa_queue_t** out_queue) { + std::lock_guard lock(mutex_); + + core::Queue* core_queue = FindOrCreateHardwareQueue(type, priority, callback, data, flags); + if (!core_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // Create unique SharedQueue structure and store the unique handle in it + SharedQueue* shared_queue = new (std::nothrow) SharedQueue(); + if (!shared_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // Copy amd_queue from HW queue + shared_queue->amd_queue = core_queue->amd_queue_; + + // Point to the SAME underlying core::Queue (shared HW queue) + shared_queue->core_queue = core_queue; + + // Create a unique handle from this new SharedQueue + hsa_queue_t* unique_handle = &shared_queue->amd_queue.hsa_queue; + + // Track metadata + auto counted_q = std::make_unique(core_queue, callback, data); + counted_queues_[unique_handle] = std::move(counted_q); + + // Increment use count + core_queue->use_count++; + + // Mark as a counted queue, if not already set + if (!core_queue->is_counted_queue) { + core_queue->is_counted_queue = true; + } + + *out_queue = unique_handle; + return HSA_STATUS_SUCCESS; +} + +core::Queue* CountedQueuePoolManager::FindOrCreateHardwareQueue( + hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority, + void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags) { + auto& pool = hw_queue_pools_[priority]; + + // Reuse least-used queue if max reached + if (pool.size() >= max_hw_queues_) { + core::Queue* least_used = nullptr; + uint32_t min_count = UINT32_MAX; + + for (auto* q : pool) { + if (q->use_count < min_count) { + min_count = q->use_count; + least_used = q; + } + } + return least_used; + } + + // Create a new hardware queue + core::Queue* cmd_queue = nullptr; + hsa_status_t status = + agent_->QueueCreate(DEFAULT_QUEUE_SIZE, type, 0, callback, data, 0, 0, &cmd_queue); + if (status != HSA_STATUS_SUCCESS) return nullptr; + + status = cmd_queue->SetPriority(priority); + if (status != HSA_STATUS_SUCCESS) return nullptr; + + cmd_queue->SetProfiling(true); + + // Add to pool + pool.push_back(cmd_queue); + return cmd_queue; +} + +hsa_status_t CountedQueuePoolManager::ReleaseQueue(hsa_queue_t* queue) { + std::lock_guard lock(mutex_); + auto it = counted_queues_.find(queue); + if (it == counted_queues_.end()) return HSA_STATUS_ERROR; + + CountedQueue* counted_q = it->second.get(); + + // Decrement internal ref count inside core::Queue object + if (counted_q->hw_queue->use_count > 0) { + counted_q->hw_queue->use_count--; + + // Remove unique handle from map when it is no longer in use by an application + if (counted_q->hw_queue->use_count == 0) { + counted_queues_.erase(queue); + + // free the associated shared_queue when removing the counted_queue + SharedQueue* shared = reinterpret_cast( + reinterpret_cast(queue) - offsetof(SharedQueue, amd_queue.hsa_queue)); + delete shared; + } + } + + return HSA_STATUS_SUCCESS; +} + +void CountedQueuePoolManager::Cleanup() { + std::lock_guard lock(mutex_); + + // Destroy hardware queues + for (auto& priority_pool : hw_queue_pools_) { + for (auto* hw_queue : priority_pool.second) { + if (hw_queue) { + hw_queue->Destroy(); + } + } + priority_pool.second.clear(); + } + hw_queue_pools_.clear(); + + // Clean up counted and shared queues + for (auto& cq : counted_queues_) { + // Recover SharedQueue from unique handle and free memory + hsa_queue_t* queue_handle = cq.first; + SharedQueue* shared = reinterpret_cast( + reinterpret_cast(queue_handle) - offsetof(SharedQueue, amd_queue.hsa_queue)); + delete shared; + } + counted_queues_.clear(); +} + +} // namespace core +} // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp index dcd52224d1..e88ae686cf 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/host_queue.cpp @@ -50,7 +50,7 @@ namespace core { HostQueue::HostQueue(core::SharedQueue* shared_queue, hsa_region_t region, uint32_t ring_size, hsa_queue_type32_t type, uint32_t features, hsa_signal_t doorbell_signal) - : Queue(shared_queue, 0), size_(ring_size) { + : Queue(shared_queue, 0, nullptr), size_(ring_size) { HSA::hsa_memory_register(this, sizeof(HostQueue)); MAKE_NAMED_SCOPE_GUARD(registerGuard, [&]() { HSA::hsa_memory_deregister(this, sizeof(HostQueue)); }); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index de2de29746..49df86fbd6 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -87,7 +87,7 @@ void HsaApiTable::Init() { // they can add preprocessor macros on the new functions constexpr size_t expected_core_api_table_size = 1016; - constexpr size_t expected_amd_ext_table_size = 624; + constexpr size_t expected_amd_ext_table_size = 640; constexpr size_t expected_image_ext_table_size = 128; constexpr size_t expected_finalizer_ext_table_size = 64; constexpr size_t expected_tools_table_size = 64; @@ -476,6 +476,8 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_enable_logging_fn = AMD::hsa_amd_enable_logging; amd_ext_api.hsa_amd_ais_file_write_fn = AMD::hsa_amd_ais_file_write; amd_ext_api.hsa_amd_ais_file_read_fn = AMD::hsa_amd_ais_file_read; + amd_ext_api.hsa_amd_counted_queue_acquire_fn = AMD::hsa_amd_counted_queue_acquire; + amd_ext_api.hsa_amd_counted_queue_release_fn = AMD::hsa_amd_counted_queue_release; amd_ext_api.hsa_amd_signal_wait_all_fn = AMD::hsa_amd_signal_wait_all; amd_ext_api.hsa_amd_memory_get_preferred_copy_engine_fn = AMD::hsa_amd_memory_get_preferred_copy_engine; amd_ext_api.hsa_amd_portable_export_dmabuf_v2_fn = AMD::hsa_amd_portable_export_dmabuf_v2; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index 91af1f46aa..c812bcb907 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -62,6 +62,7 @@ #include "core/inc/ipc_signal.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" +#include "core/inc/counted_queue_manager.h" namespace rocr { @@ -714,6 +715,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); + + // Check if this a counted queue; NACK if it is + if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE; + if (num_cu_mask_count != 0) IS_BAD_PTR(cu_mask); if (num_cu_mask_count % 32 != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask); @@ -1183,22 +1188,13 @@ hsa_status_t hsa_amd_queue_set_priority(hsa_queue_t* queue, core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); - // Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH - // HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally - // in ROCR - static std::map ext_kmt_priomap = { - {HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM}, - {HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL}, - {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH}, - }; + // Check if this a counted queue; NACK if it is + if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE; - auto priority_it = ext_kmt_priomap.find(priority); - - if (priority_it == ext_kmt_priomap.end()) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - return cmd_queue->SetPriority(priority_it->second); + // Convert to ROCR internal priority type + HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast(priority); + + return cmd_queue->SetPriority(priority_); CATCH; } @@ -1532,7 +1528,7 @@ hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* _queue, core::Queue* queue = core::Queue::Convert(_queue); IS_VALID(queue); - + return queue->GetInfo(attribute, value); CATCH; } @@ -1576,6 +1572,72 @@ hsa_status_t hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devic CATCH; } +hsa_status_t HSA_API +hsa_amd_counted_queue_acquire(hsa_agent_t agent, + hsa_queue_type_t type, + hsa_amd_queue_priority_t priority, + void (*callback)(hsa_status_t status, + hsa_queue_t* source, + void* data), + void* data, + uint64_t flags, + hsa_queue_t** queue) { + TRY; + IS_OPEN(); + // Basic validation + if (queue == nullptr) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + // Check priority + if (priority < HSA_AMD_QUEUE_PRIORITY_LOW || priority > HSA_AMD_QUEUE_PRIORITY_HIGH) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + // Only support multi-producer queues + if (type != HSA_QUEUE_TYPE_MULTI) { + return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION; + } + + // Convert handle to internal agent + core::Agent* core_agent = core::Agent::Convert(agent); + IS_VALID(core_agent); + if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + AMD::GpuAgent* gpu_agent = static_cast(core_agent); + + // Convert to ROCR internal priority type + HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast(priority); + + // Call the queue pool manager + return gpu_agent->AcquireCountedQueue(type, priority_, callback, data, flags, queue); + CATCH; +} + +hsa_status_t HSA_API +hsa_amd_counted_queue_release(hsa_queue_t* queue) { + TRY; + IS_OPEN(); + // Basic validation + if (queue == nullptr) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + core::Queue* core_queue = core::Queue::Convert(queue); + IS_VALID(core_queue); + + core::Agent* core_agent = core_queue->GetAgent(); + IS_VALID(core_agent); + if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + AMD::GpuAgent* gpu_agent = static_cast(core_agent); + + return gpu_agent->ReleaseCountedQueue(queue); + CATCH; +} + hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void *file) { TRY; return core::Runtime::runtime_singleton_->EnableLogging(flags, file); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp index d1eef19208..939a2093a7 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp @@ -438,7 +438,9 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) { hsa_status_t InterceptQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) { switch (attribute) { case HSA_AMD_QUEUE_INFO_AGENT: - case HSA_AMD_QUEUE_INFO_DOORBELL_ID: { + case HSA_AMD_QUEUE_INFO_DOORBELL_ID: + case HSA_QUEUE_INFO_USE_COUNT: + case HSA_QUEUE_INFO_HW_ID: { if (!AMD::AqlQueue::IsType(wrapped.get())) return HSA_STATUS_ERROR_INVALID_QUEUE; AMD::AqlQueue* aqlQueue = static_cast(wrapped.get()); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h index 949d689904..4a283b9e12 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h @@ -306,6 +306,8 @@ class Flag { core_dump_disable_ = (var == "1"); core_dump_pattern_ = os::GetEnvVar("HSA_COREDUMP_PATTERN"); + var = os::GetEnvVar("GPU_MAX_HW_QUEUES"); + cp_queues_limit_ = var.empty() ? 4 : atoi(var.c_str()); } void parse_masks(uint32_t maxGpu, uint32_t maxCU) { @@ -426,6 +428,8 @@ class Flag { size_t co_dmacopy_size() const { return co_dmacopy_size_; } + uint32_t cp_queues_limit() const { return cp_queues_limit_; } + bool dev_mem_queue_buf() const { return dev_mem_queue_buf_; } uint32_t signal_abort_timeout() const { return signal_abort_timeout_; } @@ -545,6 +549,8 @@ class Flag { bool enable_core_dump_progress_ = false; std::string core_dump_pattern_; + uint32_t cp_queues_limit_; + // Map GPU index post RVD to its default cu mask. std::map> cu_mask_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def index 0c06531b7b..c81f96b402 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def +++ b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def @@ -266,6 +266,8 @@ global: hsa_ext_image_create_v2; hsa_ext_image_data_get_info_v2; hsa_ext_image_destroy_v2; + hsa_amd_counted_queue_acquire; + hsa_amd_counted_queue_release; local: *; }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h index cc33320269..c2335244c8 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -273,6 +273,8 @@ struct AmdExtTable { decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn; decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn; decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn; + decltype(hsa_amd_counted_queue_acquire)* hsa_amd_counted_queue_acquire_fn; + decltype(hsa_amd_counted_queue_release)* hsa_amd_counted_queue_release_fn; }; // Table to export HSA Core Runtime Apis diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace_version.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace_version.h index 6cf1054823..19e3461b59 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace_version.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace_version.h @@ -58,7 +58,7 @@ // Step Ids of the Api tables exported by Hsa Core Runtime #define HSA_API_TABLE_STEP_VERSION 0x01 #define HSA_CORE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x08 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x09 #define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 #define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01 // Rocprofiler just checks HSA_MAGE_EXT_API_TABLE_STEP_VERSION diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h index 1b51d5bdb6..bcf9a77021 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -68,7 +68,7 @@ * - 1.15 - hsa_amd_register_system_event_handler: HSA_AMD_SYSTEM_SHUTDOWN */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 15 +#define HSA_AMD_INTERFACE_VERSION_MINOR 16 #ifdef __cplusplus extern "C" { @@ -3666,6 +3666,18 @@ typedef enum { * The type of this attribute is uint64_t. */ HSA_AMD_QUEUE_INFO_DOORBELL_ID, + /* + * Returns how many times the underlying hardware queue has been shared. + * @p value will be set to -1 if this queue was not allocated using + * hsa_amd_counted_queue_acquire. The type of this attribute is uint32_t. + */ + HSA_QUEUE_INFO_USE_COUNT, + /* + * Returns a unique ID representing the HW resource used by a counted queue. Two queues + * with the same HW_ID use the same underlying hardware queue. This query can be + * used on counted and non-counted queues. The type of this attribute is uint32_t. + */ + HSA_QUEUE_INFO_HW_ID, } hsa_queue_info_attribute_t; hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, @@ -3760,6 +3772,81 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi uint64_t size, int64_t file_offset, uint64_t *size_copied, int32_t *status); +/** + * @brief Create a queue that is limited by the GPU_MAX_HW_QUEUES environment variable. + * + * This underlying hardware queue returned by this function may be shared by other queues. For + * each priority this function will create a new hardware queue as long as the number of queues + * is less than GPU_MAX_HW_QUEUES. Once the GPU_MAX_HW_QUEUES limit is reached, this function will + * stop creating new hardware queues and return a reference to an existing queue of the requested + * @p priority instead. Each successful call will return a different @p queue handle. + * The hsa_amd_queue_get_info API can be used to determine whether this queue is currently shared. + * + * When there are multiple eligible hardware queues available, the queue with the lowest + * HSA_QUEUE_INFO_USE_COUNT will be returned. + * + * For each successful call, hsa_amd_counted_queue_release should be called to release the + * HSA_QUEUE_INFO_USE_COUNT. After release, the queue handle becomes invalid and must not be used. + * + * hsa_amd_queue_set_priority and hsa_amd_queue_cu_set_mask cannot be used on counted queues. + * + * @param[in] agent Agent where to create the queue + * + * @param[in] type For future use. HSA_QUEUE_TYPE_MULTI is the only valid option. + * + * @param[in] priority Associated priority. The GPU_MAX_HW_QUEUES limit is counted for each priority + * + * @param[in] callback callback invoked by the HSA runtime for every asynchronous event related to + * the newly created queue. May be NULL. The HSA runtime passes three arguments to the callback: a + * code identifying the event that triggered the invocation, a pointer to the queue where the event + * originated, and the application data. + * + * @param[in] data Application data that is passed to @p callback on every iteration. May be NULL. + * + * @param[in] flags For future use. Ignored. + * + * @param[out] queue Unique handle to reference the newly created queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate the resources required + * by the implementation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid or not a GPU agent. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p type is not HSA_QUEUE_TYPE_MULTI. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid priority or NULL queue pointer. + */ +hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type, + hsa_amd_queue_priority_t priority, + void (*callback)(hsa_status_t status, + hsa_queue_t* source, void* data), + void* data, uint64_t flags, hsa_queue_t** queue); + +/** + * @brief Release a counted queue and decrements its use count. + * + * Releases a queue that was previously acquired using hsa_amd_counted_queue_acquire. + * Each call to this API decrements the internal use count HSA_QUEUE_INFO_USE_COUNT + * of the underlying hardware. After this call, queue handle is invalid and must not be used. + * Once created, the hardware queue is retained until hsa_shutdown is called to avoid costly + * overhead of repeatedly creating new hardware queues, allowing them to be reused. + * + * + * @param[in] queue Counted queue handle returned from hsa_amd_counted_queue_acquire. + * Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * @retval ::HSA_STATUS_ERROR Invalid queue or queue was already released. + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized. + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The @p queue is NULL or previously released. + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The queue's agent is invalid or not a GPU agent. + */ +hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue); + /** * @brief logging types */