SWDEV-561708 Initial shared queue pool apis (#1614)

* SWDEV-561708 Initial shared queue pool apis * Validate params; some fixes in callback function (but still needs to be checked) * Dtor cleanup * minor * Enable profiling; remove callback since aql_queue takes care of it * setPriority and setCuMask APIs updated for counted queues * Increasing step and minor version for rocprofiler * Tests for CountedQueueManager * tests * Code refactored to make pool manager part of GpuAgent only (incomplete); unique handles issue pending * Refactored code to support CQM inside GpuAgent and unique handles; multithreaded test added * Changed to ASSERT_SUCCESS macros for all tests * RIng buffer overflow test added * tests fixed; cleanup added at hsa_shutdown * priority conversion table changes * Compiler warnings fixed * Rewrite 1 test; add desc and improve SetUp() code * Improvement * Unififed getinfo for both counted and non-counted queues * Address PR feedback * Addressing feedback: memleak, data type mismatch, documentation * improve comment * format * Missing HSA_API macros for roctracer * Revert "Addressing feedback: memleak, data type mismatch, documentation" This reverts commit 5e498a55fb3640e00d06cec63dcec79293fb23de. * Improving acquire api doc * release api doc improved * error codes for release api doc
2026-01-21 15:30:04 -05:00
parent f1b313780b
commit 793755532f
36 changed files with 1545 additions and 58 deletions
@@ -0,0 +1,855 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <thread>
+#include <mutex>
+#include <atomic>
+#include <unordered_map>
+#include <algorithm>
+
+#include "suites/functional/counted_queues.h"
+#include "hsa/hsa_ext_amd.h"
+#include "hsa/hsa.h"
+#include "common/base_rocr_utils.h"
+#include "gtest/gtest.h"
+#include "common/os.h"
+
+
+static bool VerifyResult(uint32_t* ar, size_t sz) {
+  for (size_t i = 0; i < sz; ++i) {
+    if (i * i != ar[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+CountedQueuesTest::CountedQueuesTest() : TestBase() {
+  set_title("RocR Counted Queues Test");
+  set_description(
+      "This test validates the behavior of Shared Counted Queues managed by the "
+      "Counted Queue Manager in a scenario where different libraries use CP "
+      "Queues and it avoids oversubscription and a subsequent performance degradation.");
+}
+
+CountedQueuesTest::~CountedQueuesTest() {}
+
+void CountedQueuesTest::SetUp() {
+  const std::string kDefaultLimit = "2";
+  static const std::unordered_map<std::string, std::string> kQueueLimits = {
+      {"Counted_Queue_Multithreaded_Dispatch_Test", "1"},
+      {"Counted_Queue_Overflow_And_Wraparound_Test", "1"},
+      {"Counted_Queue_Same_Priority_Max_Limit_Test", "4"}};
+
+  const ::testing::TestInfo* test_info = ::testing::UnitTest::GetInstance()->current_test_info();
+  if (test_info) {
+    const std::string test_name = test_info->name();
+
+    // Find the current test's required limit from map and set the env var
+    // Set default HW queue limit if not found in map 
+    auto it = kQueueLimits.find(test_name);
+    const std::string& limit = (it != kQueueLimits.end()) ? it->second : kDefaultLimit;
+    rocrtst::SetEnv("GPU_MAX_HW_QUEUES", limit.c_str());
+  }
+
+  TestBase::SetUp();
+}
+
+void CountedQueuesTest::Run() {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+  TestBase::Run();
+}
+
+void CountedQueuesTest::Close() {
+  // This will close handles opened within rocrtst utility calls and call
+  // hsa_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+void CountedQueuesTest::DisplayResults() const {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+  TestBase::DisplayResults();
+}
+
+void CountedQueuesTest::DisplayTestInfo() { TestBase::DisplayTestInfo(); }
+
+void CountedQueuesTest::CountedQueueBasicApiTest() {
+  // Find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
+
+  hsa_queue_t* queue = nullptr;
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &queue));
+  ASSERT_NE(queue, nullptr);
+
+  // Query counted queue and check internal reference count
+  int32_t use_count = 0;
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
+  EXPECT_EQ(use_count, 1);  // should be 1 after acquire
+
+  // Release the queue
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
+
+  // Check that ref count is back to 0 after release
+  hsa_status_t status;
+  status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count);
+  ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
+}
+
+void CountedQueuesTest::CountedQueues_SamePriority_MaxLimitTest() {
+  hsa_status_t status;
+
+  // Find all GPU agents
+  std::vector<hsa_agent_t> gpus;
+  ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
+  ASSERT_FALSE(gpus.empty());
+
+  const int NUM_QUEUES = 50;
+  const int MAX_HW_QUEUES = std::stoi(rocrtst::GetEnv("GPU_MAX_HW_QUEUES"));
+
+  std::vector<hsa_queue_t*> queues(NUM_QUEUES, nullptr);
+  std::vector<uint32_t> hw_ids(NUM_QUEUES, 0);
+
+  // Acquire NUM_QUEUES counted queues
+  for (int i = 0; i < NUM_QUEUES; i++) {
+    ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
+                                                 HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
+                                                 &queues[i]));
+    ASSERT_NE(queues[i], nullptr);
+  }
+
+  // Query HW IDs
+  for (int i = 0; i < NUM_QUEUES; i++) {
+    ASSERT_SUCCESS(hsa_amd_queue_get_info(queues[i], HSA_QUEUE_INFO_HW_ID, &hw_ids[i]));
+  }
+
+  // Sort and remove duplicate HW IDs
+  std::sort(hw_ids.begin(), hw_ids.end());
+  auto it = std::unique(hw_ids.begin(), hw_ids.end());
+  hw_ids.resize(std::distance(hw_ids.begin(), it));
+
+  // Ensure hardware queue count matches MAX_HW_QUEUES
+  ASSERT_EQ(hw_ids.size(), MAX_HW_QUEUES);
+
+  // Verify even distribution of logical queues over HW queues
+  // Map HW ID -> use count
+  std::unordered_map<uint32_t, uint32_t> use_counts;
+
+  for (auto* q : queues) {
+    uint32_t hwid = 0, count = 0;
+    ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_HW_ID, &hwid));
+    ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &count));
+    use_counts[hwid] = count;  // overwrites but counts are per-hw, same across queues
+  }
+
+  // Gather all use-counts for fairness check
+  std::vector<uint32_t> dist;
+  for (auto& kv : use_counts) {
+    dist.push_back(kv.second);
+  }
+
+  ASSERT_EQ(dist.size(), MAX_HW_QUEUES);
+
+  // Fair distribution: difference should not exceed 1
+  auto [min_it, max_it] = std::minmax_element(dist.begin(), dist.end());
+  EXPECT_LE(*max_it - *min_it, 1);
+
+  // Release queues
+  for (auto* q : queues) {
+    ASSERT_SUCCESS(hsa_amd_counted_queue_release(q));
+  }
+
+  // After release, querying use-count should return invalid argument
+  for (auto* q : queues) {
+    uint32_t tmp = 0;
+    EXPECT_EQ(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &tmp),
+              HSA_STATUS_ERROR_INVALID_ARGUMENT);
+  }
+}
+
+void CountedQueuesTest::InvalidArgsTest() {
+  hsa_status_t status;
+  hsa_queue_t* q = nullptr;
+
+  // Find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
+
+  // Invalid queue pointer
+  status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW,
+                                         nullptr, nullptr, 0, nullptr);
+  EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
+
+  // Invalid priority
+  const hsa_amd_queue_priority_t invalid_priority = static_cast<hsa_amd_queue_priority_t>(999);
+  status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, invalid_priority, nullptr,
+                                         nullptr, 0, &q);
+  EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
+
+  // Support multi producer queues only
+  status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_SINGLE, HSA_AMD_QUEUE_PRIORITY_LOW,
+                                         nullptr, nullptr, 0, &q);
+  EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE_CREATION);
+
+  // Check release API params
+  hsa_queue_t* queue = nullptr;
+  status = hsa_amd_counted_queue_release(queue);
+  EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
+}
+
+void CountedQueuesTest::CountedQueuesAllPrioritiesLimitTest() {
+  hsa_status_t status;
+
+  // Find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
+
+  // Acquire 2 queues per priority (total 6 queues)
+  hsa_queue_t *low1 = nullptr, *low2 = nullptr, *low3 = nullptr;
+  hsa_queue_t *normal1 = nullptr, *normal2 = nullptr, *normal3 = nullptr;
+  hsa_queue_t *high1 = nullptr, *high2 = nullptr, *high3 = nullptr;
+
+  // Low Priority
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low1));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low2));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
+                                               HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
+                                               &low3));  // should reuse low1
+
+  // Normal Priority
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal1));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal2));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
+                                               HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0,
+                                               &normal3));  // should reuse normal1
+
+  // High Priority
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high1));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high2));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high3));
+
+  // Verify reuse and independence per priority
+  uint32_t low_id1 = 0, low_id2 = 0, low_id3 = 0;
+  uint32_t norm_id1 = 0, norm_id2 = 0, norm_id3 = 0;
+  uint32_t high_id1 = 0, high_id2 = 0, high_id3 = 0;
+
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_HW_ID, &low_id1));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_HW_ID, &low_id2));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_HW_ID, &low_id3));
+
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_HW_ID, &norm_id1));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_HW_ID, &norm_id2));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_HW_ID, &norm_id3));
+
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_HW_ID, &high_id1));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_HW_ID, &high_id2));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_HW_ID, &high_id3));
+
+  // Within same priority: max 2 unique HW queues
+  EXPECT_NE(low_id1, low_id2);
+  EXPECT_TRUE(low_id3 == low_id1);
+
+  EXPECT_NE(norm_id1, norm_id2);
+  EXPECT_TRUE(norm_id3 == norm_id1);
+
+  EXPECT_NE(high_id1, high_id2);
+  EXPECT_TRUE(high_id3 == high_id1);
+
+  // Ensure different queues are used across priorities
+  EXPECT_NE(low_id1, norm_id1);
+  EXPECT_NE(norm_id1, high_id1);
+  EXPECT_NE(low_id1, high_id1);
+
+  // Verify use counts of first two HW queues
+  uint32_t low_use1 = 0, low_use2 = 0, low_use3 = 0;
+  uint32_t norm_use1 = 0, norm_use2 = 0, norm_use3 = 0;
+  uint32_t high_use1 = 0, high_use2 = 0, high_use3 = 0;
+
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_USE_COUNT, &low_use1));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_USE_COUNT, &low_use2));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_USE_COUNT, &low_use3));
+
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_USE_COUNT, &norm_use1));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_USE_COUNT, &norm_use2));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_USE_COUNT, &norm_use3));
+
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_USE_COUNT, &high_use1));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_USE_COUNT, &high_use2));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_USE_COUNT, &high_use3));
+
+  EXPECT_EQ(low_use1, 2);
+  EXPECT_EQ(low_use2, 1);
+  EXPECT_TRUE(low_use1 == low_use3);  // same HW queues, same ref count
+
+  EXPECT_EQ(norm_use1, 2);
+  EXPECT_EQ(norm_use2, 1);
+  EXPECT_TRUE(norm_use1 == norm_use3);
+
+  EXPECT_EQ(high_use1, 2);
+  EXPECT_EQ(high_use2, 1);
+  EXPECT_TRUE(high_use1 == high_use3);
+
+  // Release all queues
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(low1));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(low2));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(low3));
+
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal1));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal2));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal3));
+
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(high1));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(high2));
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(high3));
+}
+
+void CountedQueuesTest::CountedQueuesSetPriorityNackTest() {
+  hsa_status_t status;
+
+  // Find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
+
+  // Create a counted queue
+  hsa_queue_t* queue = nullptr;
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue));
+  EXPECT_NE(queue, nullptr);
+
+  // Try to set priority on this queue; should fail
+  status = hsa_amd_queue_set_priority(queue, HSA_AMD_QUEUE_PRIORITY_HIGH);
+  EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE);
+
+  // release queue
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
+}
+
+void CountedQueuesTest::CountedQueuesSetCUMaskNackTest() {
+  hsa_status_t status;
+
+  // Find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
+
+  // Create a counted queue
+  hsa_queue_t* queue = nullptr;
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
+      gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue));
+  EXPECT_NE(queue, nullptr);
+
+  // Attempt to set CU mask on counted queue; should fail
+  uint32_t cu_mask[32] = {0};  // dummy mask
+  status = hsa_amd_queue_cu_set_mask(queue, 1, cu_mask);
+  EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE);
+
+  // release queue
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
+}
+
+void CountedQueuesTest::CountedQueuesDispatchTest() {
+  hsa_status_t status;
+
+  // Common setup
+  ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
+  ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
+
+  // Load kernel
+  set_kernel_file_name("test_case_template_kernels.hsaco");
+  set_kernel_name("square");
+  ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
+
+  hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
+
+  // Allocate source buffer
+  void* src_buffer = nullptr;
+  ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &src_buffer));
+  ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, src_buffer));
+
+  // Initialize source data
+  for (uint32_t i = 0; i < 256; ++i) {
+    reinterpret_cast<uint32_t*>(src_buffer)[i] = i;
+  }
+
+  // Allocate destination buffer
+  void* dst_buffer = nullptr;
+  ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &dst_buffer));
+  ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, dst_buffer));
+
+  // Create completion signal
+  hsa_signal_t completion_signal;
+  ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &completion_signal));
+
+  // Get a counted queue
+  hsa_queue_t* queue = nullptr;
+  ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
+                                               HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
+                                               &queue));
+  EXPECT_NE(queue, nullptr);
+
+  // Query queue info
+  int32_t use_count = 0;
+  uint32_t hw_id = 0;
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &hw_id));
+  EXPECT_EQ(use_count, 1);
+
+  // Prepare kernel arguments
+  struct __attribute__((aligned(16))) local_args_t {
+    uint32_t* dstArray;
+    uint32_t* srcArray;
+    uint32_t size;
+    uint32_t pad;
+    uint64_t global_offset_x;
+    uint64_t global_offset_y;
+    uint64_t global_offset_z;
+    uint64_t printf_buffer;
+    uint64_t default_queue;
+    uint64_t completion_action;
+  } local_args;
+
+  local_args.dstArray = reinterpret_cast<uint32_t*>(dst_buffer);
+  local_args.srcArray = reinterpret_cast<uint32_t*>(src_buffer);
+  local_args.size = 256;
+  local_args.global_offset_x = 0;
+  local_args.global_offset_y = 0;
+  local_args.global_offset_z = 0;
+  local_args.printf_buffer = 0;
+  local_args.default_queue = 0;
+  local_args.completion_action = 0;
+
+  // Allocate kernel arguments
+  void* kernarg_address = nullptr;
+  ASSERT_SUCCESS(
+      hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
+  ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
+  memcpy(kernarg_address, &local_args, sizeof(local_args));
+
+  // Dispatch loop
+  int it = num_iteration() * 5;
+  const uint32_t queue_mask = queue->size - 1;
+
+  for (int i = 0; i < it; i++) {
+    // Reserve a slot in the queue
+    uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
+
+    // Get pointer to the reserved packet slot
+    hsa_kernel_dispatch_packet_t* queue_aql_packet =
+        &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[index & queue_mask];
+
+    // Fill packet fields
+    queue_aql_packet->setup = 1;
+    queue_aql_packet->workgroup_size_x = 256;
+    queue_aql_packet->workgroup_size_y = 1;
+    queue_aql_packet->workgroup_size_z = 1;
+    queue_aql_packet->grid_size_x = 256;
+    queue_aql_packet->grid_size_y = 1;
+    queue_aql_packet->grid_size_z = 1;
+    queue_aql_packet->private_segment_size = 0;
+    queue_aql_packet->group_segment_size = 0;
+    queue_aql_packet->kernel_object = kernel_object();
+    queue_aql_packet->kernarg_address = kernarg_address;
+    queue_aql_packet->completion_signal = completion_signal;
+
+    // Write header for packet
+    uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+    __atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
+                     __ATOMIC_RELEASE);
+
+    // Ring doorbell to notify GPU
+    hsa_signal_store_screlease(queue->doorbell_signal, index);
+
+    // Wait for completion signal
+    while (hsa_signal_wait_scacquire(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
+                                     HSA_WAIT_STATE_ACTIVE)) {
+    }
+
+    // Reset signal for next iteration
+    hsa_signal_store_screlease(completion_signal, 1);
+
+    // Verify results
+    ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(dst_buffer), 256));
+  }
+
+  // Verify use count before release
+  ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
+  EXPECT_EQ(use_count, 1);
+
+  // Release the counted queue
+  ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
+
+  // Verify queue info returns error after release
+  status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count);
+  ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
+
+  // Cleanup
+  ASSERT_SUCCESS(hsa_amd_memory_pool_free(kernarg_address));
+  ASSERT_SUCCESS(hsa_signal_destroy(completion_signal));
+  ASSERT_SUCCESS(hsa_amd_memory_pool_free(src_buffer));
+  ASSERT_SUCCESS(hsa_amd_memory_pool_free(dst_buffer));
+}
+
+void CountedQueuesTest::CountedQueuesMultithreadedDispatchTest() {
+  hsa_status_t status;
+
+  // Common setup
+  ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
+  ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
+
+  // Load kernel
+  set_kernel_file_name("test_case_template_kernels.hsaco");
+  set_kernel_name("square");
+  ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
+
+  hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
+
+  // Shared source buffer (read-only)
+  void* shared_src_buffer = nullptr;
+  ASSERT_SUCCESS(
+      hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer));
+  ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer));
+
+  // Initialize source data
+  for (uint32_t i = 0; i < 256; ++i) {
+    reinterpret_cast<uint32_t*>(shared_src_buffer)[i] = i;
+  }
+
+  // Structures for validation later on
+  std::mutex hwIdsMutex;
+  std::vector<uint32_t> allHwIds;
+  std::atomic<int32_t> maxUseCount{0};
+
+  auto func = [&]() {
+    // local dest buffer for each user application
+    void* local_dst_buffer = nullptr;
+    ASSERT_SUCCESS(
+        hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer));
+    ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer));
+
+    // Local completion signal for every user application
+    hsa_signal_t local_signal;
+    ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal));
+
+    // Get a counted queue
+    hsa_queue_t* queue = nullptr;
+    ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
+                                                 HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
+                                                 &queue));
+    EXPECT_NE(queue, nullptr);
+
+    if (queue == nullptr) {
+      hsa_signal_destroy(local_signal);
+      hsa_amd_memory_pool_free(local_dst_buffer);
+      return;
+    }
+
+    // Store query results for later analysis
+    int32_t localUseCount = 0;
+    uint32_t localHwId = 0;
+
+    ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &localUseCount));
+    ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &localHwId));
+
+    // Update use_count if it is larger than previous value
+    int expected = maxUseCount.load();
+    while (localUseCount > expected &&
+           !maxUseCount.compare_exchange_weak(expected, localUseCount)) {
+    }
+
+    // Store hw id for validation later on
+    {
+      std::lock_guard<std::mutex> lock(hwIdsMutex);
+      allHwIds.push_back(localHwId);
+    }
+
+    struct __attribute__((aligned(16))) local_args_t {
+      uint32_t* dstArray;
+      uint32_t* srcArray;
+      uint32_t size;
+      uint32_t pad;
+      uint64_t global_offset_x;
+      uint64_t global_offset_y;
+      uint64_t global_offset_z;
+      uint64_t printf_buffer;
+      uint64_t default_queue;
+      uint64_t completion_action;
+    } local_args;
+
+    local_args.dstArray = reinterpret_cast<uint32_t*>(local_dst_buffer);
+    local_args.srcArray = reinterpret_cast<uint32_t*>(shared_src_buffer);
+    local_args.size = 256;
+    local_args.global_offset_x = 0;
+    local_args.global_offset_y = 0;
+    local_args.global_offset_z = 0;
+    local_args.printf_buffer = 0;
+    local_args.default_queue = 0;
+    local_args.completion_action = 0;
+
+    void* kernarg_address = nullptr;
+    ASSERT_SUCCESS(
+        hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
+    ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
+
+    memcpy(kernarg_address, &local_args, sizeof(local_args));
+
+    // Dispatch loop
+    int it = num_iteration() * 5;
+    const uint32_t queue_mask = queue->size - 1;
+
+    for (int i = 0; i < it; i++) {
+      // Reserve a slot in the queue
+      uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
+
+      // Get pointer to the reserved packet slot and validate address
+      hsa_kernel_dispatch_packet_t* queue_aql_packet = &(
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[index & queue_mask];
+      ASSERT_EQ(queue_aql_packet,
+                reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address) + (index & queue_mask));
+
+      // Fill packet fields
+      queue_aql_packet->setup = 1;
+      queue_aql_packet->workgroup_size_x = 256;
+      queue_aql_packet->workgroup_size_y = 1;
+      queue_aql_packet->workgroup_size_z = 1;
+      queue_aql_packet->grid_size_x = 256;
+      queue_aql_packet->grid_size_y = 1;
+      queue_aql_packet->grid_size_z = 1;
+      queue_aql_packet->private_segment_size = 0;
+      queue_aql_packet->group_segment_size = 0;
+      queue_aql_packet->kernel_object = kernel_object();
+      queue_aql_packet->kernarg_address = kernarg_address;
+      queue_aql_packet->completion_signal = local_signal;
+
+      // Write header for packet
+      uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+      header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+      header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+      __atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
+                       __ATOMIC_RELEASE);
+
+      // Ring doorbell to notify GPU
+      hsa_signal_store_screlease(queue->doorbell_signal, index);
+
+      // Wait for completion signal to be less than 1
+      while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
+                                       HSA_WAIT_STATE_ACTIVE)) {
+      }
+
+      // Reset signal for next iteration
+      hsa_signal_store_screlease(local_signal, 1);
+
+      ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(local_dst_buffer), 256));
+    }
+
+    // Cleanup
+    hsa_amd_memory_pool_free(kernarg_address);
+    hsa_signal_destroy(local_signal);
+    hsa_amd_memory_pool_free(local_dst_buffer);
+
+    // Release the counted queue
+    ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
+  };
+
+  constexpr int kThreads = 2;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kThreads; i++) {
+    threads.emplace_back(func);
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  // With GPU_MAX_HW_QUEUES=1, all threads should share the same HW queue
+  // Check if largest useCount is same as the number of user apps accessing queues
+  EXPECT_EQ(maxUseCount.load(), kThreads);
+
+  // All HW IDs should be the same (only 1 HW queue created)
+  EXPECT_EQ(allHwIds.size(), static_cast<size_t>(kThreads));
+  for (size_t i = 1; i < allHwIds.size(); i++) {
+    EXPECT_EQ(allHwIds[i], allHwIds[0]);
+  }
+
+  hsa_amd_memory_pool_free(shared_src_buffer);
+}
+
+void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() {
+  hsa_status_t status;
+
+  // Common setup
+  ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
+  ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
+
+  // Load kernel
+  set_kernel_file_name("test_case_template_kernels.hsaco");
+  set_kernel_name("square");
+  ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
+
+  hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
+
+  void* shared_src_buffer = nullptr;
+  ASSERT_SUCCESS(
+      hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer));
+  ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer));
+
+  for (uint32_t i = 0; i < 256; ++i) {
+    reinterpret_cast<uint32_t*>(shared_src_buffer)[i] = i;
+  }
+
+  // To verify that after the queue has been used up, next index wraps around
+  std::atomic<uint64_t> maxIndexSeen{0};
+
+  auto func = [&]() {
+    // local dest buffer for each user application
+    void* local_dst_buffer = nullptr;
+    ASSERT_SUCCESS(
+        hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer));
+    ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer));
+
+    // Local completion signal for every user application
+    hsa_signal_t local_signal;
+    ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal));
+
+    // Get a counted queue
+    hsa_queue_t* queue = nullptr;
+    ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
+                                                 HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
+                                                 &queue));
+    EXPECT_NE(queue, nullptr);
+
+    if (queue == nullptr) {
+      hsa_signal_destroy(local_signal);
+      hsa_amd_memory_pool_free(local_dst_buffer);
+      return;
+    }
+
+    uint32_t queue_size = queue->size;           // should be 16384
+    const uint32_t queue_mask = queue_size - 1;  // used for index wraparound
+
+    struct __attribute__((aligned(16))) local_args_t {
+      uint32_t* dstArray;
+      uint32_t* srcArray;
+      uint32_t size;
+      uint32_t pad;
+      uint64_t global_offset_x;
+      uint64_t global_offset_y;
+      uint64_t global_offset_z;
+      uint64_t printf_buffer;
+      uint64_t default_queue;
+      uint64_t completion_action;
+    } local_args;
+
+    local_args.dstArray = reinterpret_cast<uint32_t*>(local_dst_buffer);
+    local_args.srcArray = reinterpret_cast<uint32_t*>(shared_src_buffer);
+    local_args.size = 256;
+    local_args.global_offset_x = 0;
+    local_args.global_offset_y = 0;
+    local_args.global_offset_z = 0;
+    local_args.printf_buffer = 0;
+    local_args.default_queue = 0;
+    local_args.completion_action = 0;
+
+    void* kernarg_address = nullptr;
+    ASSERT_SUCCESS(
+        hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
+    ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
+
+    memcpy(kernarg_address, &local_args, sizeof(local_args));
+
+    // Dispatch more packets than queue size to force overflow and ensure that indices wrap around
+    int it = queue_size + 5;
+
+    for (int i = 0; i < it; i++) {
+      // Reserve a slot in the queue
+      uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
+
+      uint64_t curr_max = maxIndexSeen.load();
+      while (index > curr_max && !maxIndexSeen.compare_exchange_weak(curr_max, index)) {
+      }
+
+      // Get pointer to the reserved packet slot using wraparound masking
+      uint64_t wrapped_index = index & queue_mask;
+      hsa_kernel_dispatch_packet_t* queue_aql_packet =
+          &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[wrapped_index];
+
+      // Fill packet fields
+      queue_aql_packet->setup = 1;
+      queue_aql_packet->workgroup_size_x = 256;
+      queue_aql_packet->workgroup_size_y = 1;
+      queue_aql_packet->workgroup_size_z = 1;
+      queue_aql_packet->grid_size_x = 256;
+      queue_aql_packet->grid_size_y = 1;
+      queue_aql_packet->grid_size_z = 1;
+      queue_aql_packet->private_segment_size = 0;
+      queue_aql_packet->group_segment_size = 0;
+      queue_aql_packet->kernel_object = kernel_object();
+      queue_aql_packet->kernarg_address = kernarg_address;
+      queue_aql_packet->completion_signal = local_signal;
+
+      // Write header for packet
+      uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+      header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+      header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+      __atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
+                       __ATOMIC_RELEASE);
+
+      // Ring doorbell to notify GPU
+      hsa_signal_store_screlease(queue->doorbell_signal, index);
+
+      // Wait for completion signal to be less than 1
+      while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
+                                       HSA_WAIT_STATE_ACTIVE)) {
+      }
+
+      // Reset signal for next iteration
+      hsa_signal_store_screlease(local_signal, 1);
+
+      // Verify results are still correct after wraparound
+      ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(local_dst_buffer), 256));
+    }
+
+    // Cleanup
+    hsa_amd_memory_pool_free(kernarg_address);
+    hsa_signal_destroy(local_signal);
+    hsa_amd_memory_pool_free(local_dst_buffer);
+
+    // Release the counted queue
+    ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
+  };
+
+  constexpr int kThreads = 2;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kThreads; i++) {
+    threads.emplace_back(func);
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  // Verify value of max seen index
+  uint64_t maxId = maxIndexSeen.load();
+  EXPECT_EQ(maxId, (16384 + 5) * kThreads - 1);
+
+  hsa_amd_memory_pool_free(shared_src_buffer);
+}
@@ -0,0 +1,68 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
+#define ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
+
+
+#include "suites/test_common/test_base.h"
+
+class CountedQueuesTest : public TestBase {
+ public:
+  explicit CountedQueuesTest();
+
+  // @Brief: Destructor for test case of CountedQueuesTest
+  virtual ~CountedQueuesTest();
+
+  // @Brief: Setup the environment for measurement
+  virtual void SetUp();
+
+  // @Brief: Core measurement execution
+  virtual void Run();
+
+  // @Brief: Clean up and retrive the resource
+  virtual void Close();
+
+  // @Brief: Display  results
+  virtual void DisplayResults() const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+
+  /// @brief Basic API test to acquire, query and release 1 HW queue
+  void CountedQueueBasicApiTest();
+
+  /// @brief This test verifies that when many queues of the same priority are created, 
+  // they are evenly distributed across the limited set of hardware queues, reuse those 
+  // hardware queues correctly, and report proper use-counts before and after release
+  void CountedQueues_SamePriority_MaxLimitTest();
+
+  // @brief Test to verify HSA status codes for incorrect arguments sent via API
+  void InvalidArgsTest();
+
+  // @brief Test to verify that counted queues across all priorities each reuse only 
+  // their own priority-specific hardware queues
+  void CountedQueuesAllPrioritiesLimitTest();
+
+  /// @brief Test to verify hsa_amd_queue_set_priority() does not work on counted queues
+  void CountedQueuesSetPriorityNackTest();
+
+  /// @brief Test to verify hsa_amd_queue_cu_set_mask() does not work on counted queues
+  void CountedQueuesSetCUMaskNackTest();
+
+  /// @brief Test to verify that a counted queue correctly supports kernel dispatches end-to-end
+  void CountedQueuesDispatchTest();
+
+  /// @brief Test to verify kernel dispatches onto shared queues from multiple user apps even when they
+  // all share the same HW queue 
+  void CountedQueuesMultithreadedDispatchTest();
+
+  /// @brief Test to verify ring buffer wrap around when more than queue_size number of 
+  // AQL packets are enqueued
+  void CountedQueuesOverflowWrapAroundTest();
+};
+
+#endif  // ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
@@ -82,6 +82,7 @@
 #include "suites/functional/filter_devices.h"
 #include "amd_smi/amdsmi.h"
 #include "common/common.h"
+#include "suites/functional/counted_queues.h"
 #include "common/os.h"

 static RocrTstGlobals *sRocrtstGlvalues = nullptr;
@@ -507,6 +508,69 @@ TEST(rocrtstFunc, Filter_Devices_Test) {
  );
 }

+TEST(rocrtstFunc, Counted_Queue_Basic_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueueBasicApiTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Same_Priority_Max_Limit_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueues_SamePriority_MaxLimitTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Invalid_Args_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.InvalidArgsTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Multiple_Priorities_Limit_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueuesAllPrioritiesLimitTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Set_Priority_Nack_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueuesSetPriorityNackTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Set_CUMask_Nack_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueuesSetCUMaskNackTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Dispatch_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueuesDispatchTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Multithreaded_Dispatch_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueuesMultithreadedDispatchTest();
+  RunCustomTestEpilog(&cq);
+}
+
+TEST(rocrtstFunc, Counted_Queue_Overflow_And_Wraparound_Test) {
+  CountedQueuesTest cq;
+  RunCustomTestProlog(&cq);
+  cq.CountedQueuesOverflowWrapAroundTest();
+  RunCustomTestEpilog(&cq);
+}
+
 TEST(rocrtstNeg, Memory_Negative_Tests) {
  RUN_IF_NOT_EMU_MODE(
    MemoryAllocateNegativeTest mt;
@@ -54,6 +54,5 @@
    test;                                                                                          \
  }

-
 #endif  // ROCRTST_SUITES_TEST_COMMON_MAIN_H_

@@ -217,6 +217,7 @@ set ( SRCS core/driver/driver.cpp
           core/runtime/cache.cpp
           core/runtime/svm_profiler.cpp
           core/runtime/thunk_loader.cpp
+           core/runtime/counted_queue_manager.cpp
           core/common/hsa_table_interface.cpp
           loader/executable.cpp
           libamdhsacode/amd_elf_image.cpp
@@ -1345,6 +1345,18 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
                                           size_copied, status);
 }

+hsa_status_t HSA_API hsa_amd_counted_queue_acquire(
+    hsa_agent_t agent, hsa_queue_type_t type, hsa_amd_queue_priority_t priority,
+    void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
+    uint64_t flags, hsa_queue_t** queue) {
+  return amdExtTable->hsa_amd_counted_queue_acquire_fn(agent, type, priority, callback, data, flags,
+                                                    queue);
+}
+
+hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue) {
+  return amdExtTable->hsa_amd_counted_queue_release_fn(queue);
+}
+
 // Tools only table interfaces.
 namespace rocr {

@@ -64,6 +64,28 @@
 extern r_debug _amdgpu_r_debug;

 namespace rocr {
+
+/// @brief Mapping between priority type used internally within ROCR to the type used by KFD
+
+// Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH
+// HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally
+// in ROCR
+__forceinline HSA_QUEUE_PRIORITY HsaInternalToKfdPriority(
+    rocr::HSA::hsa_amd_queue_priority_internal_t priority) {
+  switch (priority) {
+    case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_LOW:
+      return HSA_QUEUE_PRIORITY_MINIMUM;
+    case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL:
+      return HSA_QUEUE_PRIORITY_NORMAL;
+    case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_HIGH:
+      return HSA_QUEUE_PRIORITY_HIGH;
+    case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM:
+      return HSA_QUEUE_PRIORITY_MAXIMUM;
+    default:
+      return HSA_QUEUE_PRIORITY_NORMAL;
+  }
+}
+
 namespace AMD {

 #if defined(__linux__)
@@ -370,10 +392,13 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
 }

 hsa_status_t KfdDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
-                                    HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
+                                    HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
                                    void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
                                    HsaQueueResource& queue_resource) const {
-  if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id,
+  // Convert from ROCR internal priority type to KFD type
+  HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority);
+
+  if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, kfd_priority, sdma_engine_id,
                                       queue_addr, queue_size_bytes, event, &queue_resource)) !=
      HSAKMT_STATUS_SUCCESS) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -389,9 +414,12 @@ hsa_status_t KfdDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
 }

 hsa_status_t KfdDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
-                                    HSA_QUEUE_PRIORITY priority, void* queue_addr,
+                                    HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
                                    uint64_t queue_size, HsaEvent* event) const {
-  if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, priority, queue_addr, queue_size,
+  // Convert from ROCR internal priority type to KFD type
+  HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority);
+
+  if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, kfd_priority, queue_addr, queue_size,
                                    event)) != HSAKMT_STATUS_SUCCESS) {
    return HSA_STATUS_ERROR;
  }
@@ -430,7 +430,7 @@ hsa_status_t KfdVirtioDriver::MakeMemoryUnresident(const void* mem) const {
 }

 hsa_status_t KfdVirtioDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
-                                          HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
+                                          HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
                                          void* queue_addr, uint64_t queue_size_bytes,
                                          HsaEvent* event, HsaQueueResource& queue_resource) const {
  if (vhsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id, queue_addr,
@@ -447,7 +447,7 @@ hsa_status_t KfdVirtioDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
 }

 hsa_status_t KfdVirtioDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage,
-                                          HSA_QUEUE_PRIORITY priority, void* queue_mem,
+                                          HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem,
                                          uint64_t queue_size, HsaEvent* event) const {
  return HSA_STATUS_ERROR;
 }
@@ -383,7 +383,7 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
 }

 hsa_status_t XdnaDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
-                                     HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
+                                     HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
                                     void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
                                     HsaQueueResource& queue_resource) const {
  queue_resource.QueueId = AMDXDNA_INVALID_CTX_HANDLE;
@@ -407,7 +407,7 @@ hsa_status_t XdnaDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
 }

 hsa_status_t XdnaDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
-                                     HSA_QUEUE_PRIORITY priority, void* queue_addr,
+                                     HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
                                     uint64_t queue_size, HsaEvent* event) const {
  // AIE doesn't support queue updates.
  return HSA_STATUS_ERROR_INVALID_QUEUE;
@@ -73,7 +73,7 @@ class AieAqlQueue : public core::Queue,
  ~AieAqlQueue();

  hsa_status_t Inactivate() override;
-  hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
+  hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override;
  void Destroy() override;
  uint64_t LoadReadIndexRelaxed() override;
  uint64_t LoadReadIndexAcquire() override;
@@ -73,7 +73,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  hsa_status_t Inactivate() override;

  /// @brief Change the scheduling priority of the queue
-  hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
+  hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override;

  /// @brief Destroy ref counted queue
  void Destroy() override;
@@ -316,7 +316,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  bool suspended_;

  // Thunk dispatch and wavefront scheduling priority
-  HSA_QUEUE_PRIORITY priority_;
+  HSA::hsa_amd_queue_priority_internal_t priority_;

  // Exception notification signal
  Signal* exception_signal_;
@@ -62,6 +62,7 @@
 #include "core/util/locks.h"
 #include "core/util/small_heap.h"
 #include "pcs/pcs_runtime.h"
+#include "core/inc/counted_queue_manager.h"

 namespace rocr {
 namespace AMD {
@@ -342,6 +343,16 @@ class GpuAgent : public GpuAgentInt {
  void AcquireQueueAltScratch(ScratchInfo& scratch) override;
  void ReleaseQueueAltScratch(ScratchInfo& scratch) override;

+  // @brief Create a pool of shared queues for multiple user applications within a max limit 
+  hsa_status_t AcquireCountedQueue(hsa_queue_type_t type,
+                                   HSA::hsa_amd_queue_priority_internal_t priority,
+                                   void (*callback)(hsa_status_t, hsa_queue_t*, void*),
+                                   void* data, uint64_t flags,
+                                   hsa_queue_t** out_queue);
+
+  // @brief Release a queue earlier used by application
+  hsa_status_t ReleaseCountedQueue(hsa_queue_t* queue);
+
  // @brief Override from AMD::GpuAgentInt.
  void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) override;

@@ -645,6 +656,9 @@ class GpuAgent : public GpuAgentInt {
  // @brief HSA profile.
  hsa_profile_t profile_;

+  // @brief Pool of shared queues owned by this agent
+  rocr::core::CountedQueuePoolManager queue_pool_;
+
  void* trap_code_buf_;

  size_t trap_code_buf_size_;
@@ -97,10 +97,10 @@ public:
                              uint32_t node_id) override;
  hsa_status_t FreeMemory(void *mem, size_t size) override;
  hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
-                           HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
+                           HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
                           uint64_t queue_size_bytes, HsaEvent* event,
                           HsaQueueResource& queue_resource) const override;
-  hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
+  hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority,
                           void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
  hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
  hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
@@ -90,12 +90,12 @@ class KfdVirtioDriver final : public core::Driver {
                                  const uint32_t* nodes) const override;
  hsa_status_t MakeMemoryUnresident(const void* mem) const override;
  hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
-                           HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
+                           HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
                           uint64_t queue_size_bytes, HsaEvent* event,
                           HsaQueueResource& queue_resource) const override;
  hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
  hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage,
-                           HSA_QUEUE_PRIORITY priority, void* queue_mem, uint64_t queue_size,
+                           HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem, uint64_t queue_size,
                           HsaEvent* event) const override;
  hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t num_cu_mask,
                              uint32_t* cu_mask) const override;
@@ -204,10 +204,10 @@ public:
                              uint32_t node_id) override;
  hsa_status_t FreeMemory(void *mem, size_t size) override;
  hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
-                           HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
+                           HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
                           uint64_t queue_size_bytes, HsaEvent* event,
                           HsaQueueResource& queue_resource) const override;
-  hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
+  hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority,
                           void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
  hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
  hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
@@ -0,0 +1,68 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
+#define HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
+
+
+#include "hsa.h"
+#include "hsa_ext_amd.h"
+#include "core/inc/agent.h"
+#include "core/inc/runtime.h"
+#include <map>
+#include <mutex>
+#include <vector>
+#include <memory>
+
+namespace rocr {
+namespace core {
+
+// Wrapper for a logical counted queue (unique handle + callback)
+struct CountedQueue {
+  core::Queue* hw_queue; // this will store the public handle of HW Queue (hsa_queue_t)
+  void (*callback)(hsa_status_t, hsa_queue_t*, void*);
+  void* callback_data;
+
+  CountedQueue(core::Queue* hw, void (*cb)(hsa_status_t, hsa_queue_t*, void*), void* data)
+      : hw_queue(hw), callback(cb), callback_data(data) {}
+};
+
+// Manages the pool of counted queues for a single GPU agent
+class CountedQueuePoolManager {
+ public:
+  explicit CountedQueuePoolManager(core::Agent*);
+
+  // Acquire a queue (either reuse or create new)
+  hsa_status_t AcquireQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
+                            void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data,
+                            uint64_t flags, hsa_queue_t** out_queue);
+
+  // Release a logical queue
+  hsa_status_t ReleaseQueue(hsa_queue_t* queue);
+
+  // Called during hsa_shutdown to remove all user and CP queues
+  void Cleanup();
+
+ private:
+  core::Queue* FindOrCreateHardwareQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
+                                           void (*callback)(hsa_status_t, hsa_queue_t*, void*),
+                                           void* data, uint64_t flags);
+  
+  core::Agent* agent_; // pointer to the gpu agent that owns this pool
+  uint32_t max_hw_queues_;
+  std::mutex mutex_;
+
+  // Pool of hw queues by priority on the agent
+  std::map<HSA::hsa_amd_queue_priority_internal_t, std::vector<core::Queue*>> hw_queue_pools_;
+
+  // Map from unique handle to CountedQueue (hw queue, metadata per acquire request)
+  std::map<hsa_queue_t*, std::unique_ptr<CountedQueue>> counted_queues_;
+};
+
+}  // namespace core
+}  // namespace rocr
+
+#endif  // HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
@@ -50,6 +50,7 @@
 #include "core/inc/memory_region.h"
 #include "hsakmt/hsakmttypes.h"
 #include "inc/hsa.h"
+#include "core/inc/hsa_internal.h"

 namespace rocr {
 namespace core {
@@ -159,7 +160,7 @@ public:
  /// @param[in] event HsaEvent for event-driven callbacks.
  /// @param[out] queue_resource Queue resource information populated by the driver.
  virtual hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
-                                   HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
+                                   HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
                                   void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
                                   HsaQueueResource& queue_resource) const = 0;

@@ -175,7 +176,7 @@ public:
  /// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes.
  /// @param[in] event HsaEvent for event-driven callbacks.
  virtual hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
-                                   HSA_QUEUE_PRIORITY priority, void* queue_addr,
+                                   HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
                                   uint64_t queue_size_bytes, HsaEvent* event) const = 0;

  /// @brief Set the CU mask for a queue.
@@ -60,7 +60,7 @@ class HostQueue : public Queue {
  ~HostQueue();

  hsa_status_t Inactivate() override { return HSA_STATUS_SUCCESS; }
-  hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override {
+  hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override {
    return HSA_STATUS_ERROR_INVALID_QUEUE;
  }

@@ -381,6 +381,15 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
                                           uint64_t size, int64_t file_offset,
                                           uint64_t *size_copied, int32_t *status);

+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type,
+                                           hsa_amd_queue_priority_t priority,
+                                           void (*callback)(hsa_status_t status,
+                                                            hsa_queue_t* source, void* data),
+                                           void* data, uint64_t flags, hsa_queue_t** queue);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue);
+
 // Mirrors Amd Extension Apis
 hsa_status_t HSA_API hsa_amd_enable_logging(uint8_t* flags, void* file);

@@ -44,10 +44,18 @@
 #define HSA_RUNTIME_CORE_INC_HSA_INTERNAL_H

 #include "inc/hsa.h"
+#include "inc/hsa_ext_amd.h"

 namespace rocr {
 namespace HSA {

+  typedef enum hsa_amd_queue_priority_internal_s {
+    HSA_AMD_QUEUE_PRIORITY_LOW = HSA_AMD_QUEUE_PRIORITY_LOW,
+    HSA_AMD_QUEUE_PRIORITY_NORMAL = HSA_AMD_QUEUE_PRIORITY_NORMAL,
+    HSA_AMD_QUEUE_PRIORITY_HIGH = HSA_AMD_QUEUE_PRIORITY_HIGH,
+    HSA_AMD_QUEUE_PRIORITY_MAXIMUM  = HSA_AMD_QUEUE_PRIORITY_HIGH + 1,
+  } hsa_amd_queue_priority_internal_t;
+
  // Define core namespace interfaces - copy of function declarations in hsa.h
  hsa_status_t hsa_init();
  hsa_status_t hsa_shut_down();
@@ -66,7 +66,7 @@ class QueueWrapper : public Queue {
  explicit QueueWrapper(std::unique_ptr<Queue> queue)
      : Queue(static_cast<core::SharedQueue*>(core::Runtime::runtime_singleton_->system_allocator()(
                  sizeof(core::SharedQueue), 4096, 0, 0)),
-              0),
+              0, nullptr),
        wrapped(std::move(queue)) {
    memcpy(&amd_queue_, &wrapped->amd_queue_, sizeof(amd_queue_));
    wrapped->set_public_handle(wrapped.get(), public_handle_);
@@ -77,7 +77,7 @@ class QueueWrapper : public Queue {
  }

  hsa_status_t Inactivate() override { return wrapped->Inactivate(); }
-  hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override {
+  hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override {
    return wrapped->SetPriority(priority);
  }
  uint64_t LoadReadIndexAcquire() override { return wrapped->LoadReadIndexAcquire(); }
@@ -168,12 +168,15 @@ All funtions other than Convert and public_handle must be virtual.
 */
 class Queue : public Checked<0xFA3906A679F9DB49> {
 public:
-  Queue(SharedQueue* shared_queue, uint64_t queue_flags)
-      : Queue(shared_queue, queue_flags, false) {}
+  Queue(SharedQueue* shared_queue, uint64_t queue_flags, core::Agent* agent)
+      : Queue(shared_queue, queue_flags, false, agent) {}

-  Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering)
+  Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering, core::Agent* agent)
      : amd_queue_(shared_queue->amd_queue),
+        use_count(0),
+        is_counted_queue(false),
        shared_queue_(shared_queue),
+        agent_(agent),
        flags_(queue_flags),
        pcie_write_ordering_(pcie_write_ordering) {
    public_handle_ = Convert(this);
@@ -213,7 +216,7 @@ class Queue : public Checked<0xFA3906A679F9DB49> {
  virtual hsa_status_t Inactivate() = 0;

  /// @brief Change the scheduling priority of the queue
-  virtual hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) = 0;
+  virtual hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) = 0;

  /// @brief Reads the Read Index of Queue using Acquire semantics
  ///
@@ -372,6 +375,13 @@ class Queue : public Checked<0xFA3906A679F9DB49> {

  hsa_queue_t* public_handle() const { return public_handle_; }

+  // Get a pointer to the agent that owns this queue
+  core::Agent* GetAgent() { return agent_; }
+
+  // @brief Attributes specifically for counted queue types
+  uint32_t use_count;
+  bool is_counted_queue;
+
  typedef void* rtti_t;

  bool IsType(rtti_t id) { return _IsA(id); }
@@ -401,6 +411,8 @@ class Queue : public Checked<0xFA3906A679F9DB49> {

  SharedQueue* shared_queue_;

+  core::Agent* agent_; // pointer to the agent that owns this queue
+
  hsa_queue_t* public_handle_;

  /// Next available queue id.
@@ -67,7 +67,7 @@ namespace AMD {

 AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_t req_size_pkts,
                         uint32_t node_id, uint64_t flags)
-    : Queue(shared_queue, flags),
+    : Queue(shared_queue, flags, agent),
      LocalSignal(0, false),
      DoorbellSignal(signal()),
      agent_(*agent),
@@ -104,7 +104,7 @@ AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_

  HsaQueueResource queue_resource = {};
  hsa_status_t status =
-      agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, HSA_QUEUE_PRIORITY_NORMAL, 0,
+      agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL, 0,
                                  nullptr, queue_size_bytes_, nullptr, queue_resource);
  if (status != HSA_STATUS_SUCCESS) {
    throw AMD::hsa_exception(status, "Failed to create a hardware context for an AIE queue.");
@@ -135,7 +135,7 @@ hsa_status_t AieAqlQueue::Inactivate() {
  return status;
 }

-hsa_status_t AieAqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
+hsa_status_t AieAqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) {
  return HSA_STATUS_SUCCESS;
 }

@@ -277,6 +277,13 @@ hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute,
      // Hardware doorbell supports AQL semantics.
      *static_cast<uint64_t*>(value) = reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
      break;
+    case HSA_QUEUE_INFO_USE_COUNT:
+      // AIE queues do not support counted queue features
+      *static_cast<uint32_t*>(value) = static_cast<uint32_t>(-1);
+      break;
+    case HSA_QUEUE_INFO_HW_ID:
+      *static_cast<uint32_t*>(value) = public_handle()->id;
+      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
@@ -81,7 +81,7 @@ namespace AMD {
 AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_size_pkts,
                   HSAuint32 node_id, ScratchInfo& scratch, core::HsaEventCallback callback,
                   void* err_data, uint64_t flags)
-    : Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu()),
+    : Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu(), agent),
      LocalSignal(0, false),
      DoorbellSignal(signal()),
      ring_buf_(nullptr),
@@ -97,7 +97,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
      dynamicScratchState(0),
      exceptionState(0),
      suspended_(false),
-      priority_(HSA_QUEUE_PRIORITY_NORMAL),
+      priority_(HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL),
      exception_signal_(nullptr) {

  // Queue size is a function of several restrictions.
@@ -495,6 +495,21 @@ hsa_status_t AqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value
      *(reinterpret_cast<uint64_t*>(value)) =
          reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
      break;
+    case HSA_QUEUE_INFO_USE_COUNT:
+      if (!is_counted_queue) {
+        *static_cast<uint32_t*>(value) = static_cast<uint32_t>(-1);
+      } else {
+        if (use_count == 0) {
+          // Queue was released
+          return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+        }
+        *static_cast<uint32_t*>(value) = use_count;
+      }
+      break;
+    case HSA_QUEUE_INFO_HW_ID:
+      // Return the hardware queue ID for both counted and non-counted queues
+      *static_cast<uint32_t*>(value) = public_handle()->id; 
+      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
@@ -631,7 +646,7 @@ hsa_status_t AqlQueue::Inactivate() {
  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t AqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
+hsa_status_t AqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) {
  if (suspended_) {
    return HSA_STATUS_ERROR_INVALID_QUEUE;
  }
@@ -184,7 +184,7 @@ hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgm
  // boolean flag
  const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
                                     (use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
-  if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM,
+  if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM,
                                   rec_eng, queue_start_addr_, kQueueSize, nullptr,
                                   queue_resource_) != HSA_STATUS_SUCCESS) {
    LogPrint(HSA_AMD_LOG_FLAG_INFO, "Failed to create queue, size=%d, type=%d,"
@@ -99,6 +99,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
      current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
      scratch_used_large_(0),
      queues_(),
+      queue_pool_(this),
      trap_code_buf_(NULL),
      trap_code_buf_size_(0),
      doorbell_queue_map_(NULL),
@@ -774,13 +775,13 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {

 void GpuAgent::InitDma() {
  // Setup lazy init pointers on queues and blits.
-  auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) {
+  auto queue_lambda = [this](HSA::hsa_amd_queue_priority_internal_t priority = HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL) {
    auto queue = CreateInterceptibleQueue();
    if (queue == nullptr)
      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
                               "Internal queue creation failed.");

-    if (priority != HSA_QUEUE_PRIORITY_NORMAL)
+    if (priority != HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL)
      if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS)
        throw AMD::hsa_exception(HSA_STATUS_ERROR,
                                "Failed to increase queue priority for PC Sampling");
@@ -795,7 +796,7 @@ void GpuAgent::InitDma() {
  // Dedicated compute queue for PC Sampling CP-DMA commands. We need a dedicated queue that runs at
  // highest priority because we do not want the CP-DMA commands to be delayed/blocked due to
  // other dispatches/barriers that could be in the other AQL queues.
-  queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
+  queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM); });

  // Decide which engine to use for blits.
  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev, uint32_t rec_eng) {
@@ -924,6 +925,10 @@ void GpuAgent::PreloadBlits() {
 void GpuAgent::ReleaseResources() {
  if (this->Enabled()) {
    this->Disable();
+
+    // Remove all shared hardware queues from pool
+    queue_pool_.Cleanup();
+
    for (auto& blit : blits_) {
      if (!blit.empty()) {
        hsa_status_t status = blit->Destroy();
@@ -1829,6 +1834,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u

  auto aql_queue = new AqlQueue(shared_queue, this, size, node_id(), scratch, event_callback, data,
                                flags);
+  
  *queue = aql_queue;
  aql_queues_.push_back(aql_queue);

@@ -3519,5 +3525,17 @@ hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& sessi
  return HSA_STATUS_SUCCESS;
 }

+hsa_status_t GpuAgent::AcquireCountedQueue(hsa_queue_type_t type,
+                                           HSA::hsa_amd_queue_priority_internal_t priority,
+                                           void (*callback)(hsa_status_t, hsa_queue_t*, void*),
+                                           void* data, uint64_t flags,
+                                           hsa_queue_t** out_queue) {
+  return queue_pool_.AcquireQueue(type, priority, callback, data, flags, out_queue);
+}
+
+hsa_status_t GpuAgent::ReleaseCountedQueue(hsa_queue_t* queue) {
+  return queue_pool_.ReleaseQueue(queue);
+}
+
 }  // namespace amd
 }  // namespace rocr
@@ -0,0 +1,145 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "core/inc/counted_queue_manager.h"
+#include "core/inc/agent.h"
+#include "core/inc/runtime.h"
+
+namespace rocr {
+namespace core {
+
+constexpr size_t DEFAULT_QUEUE_SIZE = 16384;
+
+CountedQueuePoolManager::CountedQueuePoolManager(core::Agent* agent) : agent_(agent) {
+  // Read in GPU_MAX_HW_QUEUES flag value
+  max_hw_queues_ = core::Runtime::runtime_singleton_->flag().cp_queues_limit();
+}
+
+hsa_status_t CountedQueuePoolManager::AcquireQueue(
+    hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
+    void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags,
+    hsa_queue_t** out_queue) {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  core::Queue* core_queue = FindOrCreateHardwareQueue(type, priority, callback, data, flags);
+  if (!core_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+  // Create unique SharedQueue structure and store the unique handle in it
+  SharedQueue* shared_queue = new (std::nothrow) SharedQueue();
+  if (!shared_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+  // Copy amd_queue from HW queue
+  shared_queue->amd_queue = core_queue->amd_queue_;
+
+  // Point to the SAME underlying core::Queue (shared HW queue)
+  shared_queue->core_queue = core_queue;
+
+  // Create a unique handle from this new SharedQueue
+  hsa_queue_t* unique_handle = &shared_queue->amd_queue.hsa_queue;
+
+  // Track metadata
+  auto counted_q = std::make_unique<CountedQueue>(core_queue, callback, data);
+  counted_queues_[unique_handle] = std::move(counted_q);
+
+  // Increment use count
+  core_queue->use_count++;
+  
+  // Mark as a counted queue, if not already set
+  if (!core_queue->is_counted_queue) {
+    core_queue->is_counted_queue = true;
+  }
+
+  *out_queue = unique_handle;
+  return HSA_STATUS_SUCCESS;
+}
+
+core::Queue* CountedQueuePoolManager::FindOrCreateHardwareQueue(
+    hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
+    void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags) {
+  auto& pool = hw_queue_pools_[priority];
+
+  // Reuse least-used queue if max reached
+  if (pool.size() >= max_hw_queues_) {
+    core::Queue* least_used = nullptr;
+    uint32_t min_count = UINT32_MAX;
+
+    for (auto* q : pool) {
+      if (q->use_count < min_count) {
+        min_count = q->use_count;
+        least_used = q;
+      }
+    }
+    return least_used;
+  }
+
+  // Create a new hardware queue
+  core::Queue* cmd_queue = nullptr;
+  hsa_status_t status =
+      agent_->QueueCreate(DEFAULT_QUEUE_SIZE, type, 0, callback, data, 0, 0, &cmd_queue);
+  if (status != HSA_STATUS_SUCCESS) return nullptr;
+
+  status = cmd_queue->SetPriority(priority);
+  if (status != HSA_STATUS_SUCCESS) return nullptr;
+
+  cmd_queue->SetProfiling(true);
+
+  // Add to pool
+  pool.push_back(cmd_queue);
+  return cmd_queue;
+}
+
+hsa_status_t CountedQueuePoolManager::ReleaseQueue(hsa_queue_t* queue) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto it = counted_queues_.find(queue);
+  if (it == counted_queues_.end()) return HSA_STATUS_ERROR;
+
+  CountedQueue* counted_q = it->second.get();
+
+  // Decrement internal ref count inside core::Queue object
+  if (counted_q->hw_queue->use_count > 0) {
+    counted_q->hw_queue->use_count--;
+    
+    // Remove unique handle from map when it is no longer in use by an application
+    if (counted_q->hw_queue->use_count == 0) {
+      counted_queues_.erase(queue);
+
+      // free the associated shared_queue when removing the counted_queue
+      SharedQueue* shared = reinterpret_cast<SharedQueue*>(
+        reinterpret_cast<char*>(queue) - offsetof(SharedQueue, amd_queue.hsa_queue));
+      delete shared;
+    }
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+void CountedQueuePoolManager::Cleanup() {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  // Destroy hardware queues
+  for (auto& priority_pool : hw_queue_pools_) {
+    for (auto* hw_queue : priority_pool.second) {
+      if (hw_queue) {
+        hw_queue->Destroy();
+      }
+    }
+    priority_pool.second.clear();
+  }
+  hw_queue_pools_.clear();
+
+  // Clean up counted and shared queues
+  for (auto& cq : counted_queues_) {
+    // Recover SharedQueue from unique handle and free memory
+    hsa_queue_t* queue_handle = cq.first;
+    SharedQueue* shared = reinterpret_cast<SharedQueue*>(
+        reinterpret_cast<char*>(queue_handle) - offsetof(SharedQueue, amd_queue.hsa_queue));
+    delete shared;
+  }
+  counted_queues_.clear();
+}
+
+}  // namespace core
+}  // namespace rocr
@@ -50,7 +50,7 @@ namespace core {

 HostQueue::HostQueue(core::SharedQueue* shared_queue, hsa_region_t region, uint32_t ring_size,
                     hsa_queue_type32_t type, uint32_t features, hsa_signal_t doorbell_signal)
-    : Queue(shared_queue, 0), size_(ring_size) {
+    : Queue(shared_queue, 0, nullptr), size_(ring_size) {
  HSA::hsa_memory_register(this, sizeof(HostQueue));
  MAKE_NAMED_SCOPE_GUARD(registerGuard,
                         [&]() { HSA::hsa_memory_deregister(this, sizeof(HostQueue)); });
@@ -87,7 +87,7 @@ void HsaApiTable::Init() {
  // they can add preprocessor macros on the new functions

  constexpr size_t expected_core_api_table_size = 1016;
-  constexpr size_t expected_amd_ext_table_size = 624;
+  constexpr size_t expected_amd_ext_table_size = 640;
  constexpr size_t expected_image_ext_table_size = 128;
  constexpr size_t expected_finalizer_ext_table_size = 64;
  constexpr size_t expected_tools_table_size = 64;
@@ -476,6 +476,8 @@ void HsaApiTable::UpdateAmdExts() {
  amd_ext_api.hsa_amd_enable_logging_fn = AMD::hsa_amd_enable_logging;
  amd_ext_api.hsa_amd_ais_file_write_fn = AMD::hsa_amd_ais_file_write;
  amd_ext_api.hsa_amd_ais_file_read_fn = AMD::hsa_amd_ais_file_read;
+  amd_ext_api.hsa_amd_counted_queue_acquire_fn = AMD::hsa_amd_counted_queue_acquire;
+  amd_ext_api.hsa_amd_counted_queue_release_fn = AMD::hsa_amd_counted_queue_release;
  amd_ext_api.hsa_amd_signal_wait_all_fn = AMD::hsa_amd_signal_wait_all;
  amd_ext_api.hsa_amd_memory_get_preferred_copy_engine_fn = AMD::hsa_amd_memory_get_preferred_copy_engine;
  amd_ext_api.hsa_amd_portable_export_dmabuf_v2_fn = AMD::hsa_amd_portable_export_dmabuf_v2;
@@ -62,6 +62,7 @@
 #include "core/inc/ipc_signal.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
+#include "core/inc/counted_queue_manager.h"

 namespace rocr {

@@ -714,6 +715,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu

  core::Queue* cmd_queue = core::Queue::Convert(queue);
  IS_VALID(cmd_queue);
+
+  // Check if this a counted queue; NACK if it is
+  if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE;
+
  if (num_cu_mask_count != 0) IS_BAD_PTR(cu_mask);
  if (num_cu_mask_count % 32 != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask);
@@ -1183,22 +1188,13 @@ hsa_status_t hsa_amd_queue_set_priority(hsa_queue_t* queue,
  core::Queue* cmd_queue = core::Queue::Convert(queue);
  IS_VALID(cmd_queue);

-  // Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH
-  // HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally
-  // in ROCR
-  static std::map<hsa_amd_queue_priority_t, HSA_QUEUE_PRIORITY> ext_kmt_priomap = {
-      {HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM},
-      {HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL},
-      {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH},
-  };
+  // Check if this a counted queue; NACK if it is                                                
+  if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE;

-  auto priority_it = ext_kmt_priomap.find(priority);
-
-  if (priority_it == ext_kmt_priomap.end()) {
-    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
-  }
-
-  return cmd_queue->SetPriority(priority_it->second);
+  // Convert to ROCR internal priority type
+  HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast<HSA::hsa_amd_queue_priority_internal_t>(priority);
+  
+  return cmd_queue->SetPriority(priority_);
  CATCH;
 }

@@ -1532,7 +1528,7 @@ hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* _queue,

  core::Queue* queue = core::Queue::Convert(_queue);
  IS_VALID(queue);
-
+  
  return queue->GetInfo(attribute, value);
  CATCH;
 }
@@ -1576,6 +1572,72 @@ hsa_status_t hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devic
  CATCH;
 }

+hsa_status_t HSA_API
+hsa_amd_counted_queue_acquire(hsa_agent_t agent,
+                              hsa_queue_type_t type,
+                              hsa_amd_queue_priority_t priority,
+                              void (*callback)(hsa_status_t status,
+                                               hsa_queue_t* source,
+                                               void* data),
+                              void* data,
+                              uint64_t flags,
+                              hsa_queue_t** queue) {
+  TRY;
+  IS_OPEN();
+  // Basic validation
+  if (queue == nullptr) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  // Check priority 
+  if (priority < HSA_AMD_QUEUE_PRIORITY_LOW || priority > HSA_AMD_QUEUE_PRIORITY_HIGH) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  // Only support multi-producer queues
+  if (type != HSA_QUEUE_TYPE_MULTI) {
+    return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION;
+  }
+
+  // Convert handle to internal agent
+  core::Agent* core_agent = core::Agent::Convert(agent);
+  IS_VALID(core_agent);
+  if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
+    return HSA_STATUS_ERROR_INVALID_AGENT;
+  }
+  AMD::GpuAgent* gpu_agent = static_cast<AMD::GpuAgent*>(core_agent);
+
+  // Convert to ROCR internal priority type 
+  HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast<HSA::hsa_amd_queue_priority_internal_t>(priority);
+  
+  // Call the queue pool manager
+  return gpu_agent->AcquireCountedQueue(type, priority_, callback, data, flags, queue);
+  CATCH;
+}
+
+hsa_status_t HSA_API
+hsa_amd_counted_queue_release(hsa_queue_t* queue) {
+  TRY;
+  IS_OPEN();   
+  // Basic validation                           
+  if (queue == nullptr) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  core::Queue* core_queue = core::Queue::Convert(queue);
+  IS_VALID(core_queue);
+
+  core::Agent* core_agent = core_queue->GetAgent();
+  IS_VALID(core_agent);
+  if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
+    return HSA_STATUS_ERROR_INVALID_AGENT;
+  }
+  AMD::GpuAgent* gpu_agent = static_cast<AMD::GpuAgent*>(core_agent);
+
+  return gpu_agent->ReleaseCountedQueue(queue);
+  CATCH;
+}
+
 hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void *file) {
  TRY;
  return core::Runtime::runtime_singleton_->EnableLogging(flags, file);
@@ -438,7 +438,9 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
 hsa_status_t InterceptQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) {
  switch (attribute) {
    case HSA_AMD_QUEUE_INFO_AGENT:
-    case HSA_AMD_QUEUE_INFO_DOORBELL_ID: {
+    case HSA_AMD_QUEUE_INFO_DOORBELL_ID: 
+    case HSA_QUEUE_INFO_USE_COUNT:
+    case HSA_QUEUE_INFO_HW_ID: {
      if (!AMD::AqlQueue::IsType(wrapped.get())) return HSA_STATUS_ERROR_INVALID_QUEUE;

      AMD::AqlQueue* aqlQueue = static_cast<AMD::AqlQueue*>(wrapped.get());
@@ -306,6 +306,8 @@ class Flag {
    core_dump_disable_ = (var == "1");

    core_dump_pattern_ = os::GetEnvVar("HSA_COREDUMP_PATTERN");
+    var = os::GetEnvVar("GPU_MAX_HW_QUEUES");
+    cp_queues_limit_ = var.empty() ? 4 : atoi(var.c_str());
  }

  void parse_masks(uint32_t maxGpu, uint32_t maxCU) {
@@ -426,6 +428,8 @@ class Flag {

  size_t co_dmacopy_size() const { return co_dmacopy_size_; }

+  uint32_t cp_queues_limit() const { return cp_queues_limit_; }
+
  bool dev_mem_queue_buf() const { return dev_mem_queue_buf_; }

  uint32_t signal_abort_timeout() const { return signal_abort_timeout_; }
@@ -545,6 +549,8 @@ class Flag {
  bool enable_core_dump_progress_ = false;
  std::string core_dump_pattern_;

+  uint32_t cp_queues_limit_;
+
  // Map GPU index post RVD to its default cu mask.
  std::map<uint32_t, std::vector<uint32_t>> cu_mask_;

@@ -266,6 +266,8 @@ global:
 	hsa_ext_image_create_v2;
 	hsa_ext_image_data_get_info_v2;
 	hsa_ext_image_destroy_v2;
+	hsa_amd_counted_queue_acquire;
+	hsa_amd_counted_queue_release;
 local:
    *;
 };
@@ -273,6 +273,8 @@ struct AmdExtTable {
  decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn;
  decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn;
  decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn;
+  decltype(hsa_amd_counted_queue_acquire)* hsa_amd_counted_queue_acquire_fn;
+  decltype(hsa_amd_counted_queue_release)* hsa_amd_counted_queue_release_fn;
 };

 // Table to export HSA Core Runtime Apis
@@ -58,7 +58,7 @@
 // Step Ids of the Api tables exported by Hsa Core Runtime
 #define HSA_API_TABLE_STEP_VERSION                  0x01
 #define HSA_CORE_API_TABLE_STEP_VERSION             0x00
-#define HSA_AMD_EXT_API_TABLE_STEP_VERSION          0x08
+#define HSA_AMD_EXT_API_TABLE_STEP_VERSION          0x09
 #define HSA_FINALIZER_API_TABLE_STEP_VERSION        0x00
 #define HSA_IMAGE_API_TABLE_STEP_VERSION            0x01
 // Rocprofiler just checks HSA_MAGE_EXT_API_TABLE_STEP_VERSION
@@ -68,7 +68,7 @@
 * - 1.15 - hsa_amd_register_system_event_handler: HSA_AMD_SYSTEM_SHUTDOWN
 */
 #define HSA_AMD_INTERFACE_VERSION_MAJOR 1
-#define HSA_AMD_INTERFACE_VERSION_MINOR 15
+#define HSA_AMD_INTERFACE_VERSION_MINOR 16

 #ifdef __cplusplus
 extern "C" {
@@ -3666,6 +3666,18 @@ typedef enum {
   * The type of this attribute is uint64_t.
   */
  HSA_AMD_QUEUE_INFO_DOORBELL_ID,
+  /*
+  * Returns how many times the underlying hardware queue has been shared.
+  * @p value will be set to -1 if this queue was not allocated using 
+  * hsa_amd_counted_queue_acquire. The type of this attribute is uint32_t.
+  */
+  HSA_QUEUE_INFO_USE_COUNT,
+  /*
+  * Returns a unique ID representing the HW resource used by a counted queue. Two queues
+  * with the same HW_ID use the same underlying hardware queue. This query can be
+  * used on counted and non-counted queues. The type of this attribute is uint32_t.
+  */
+  HSA_QUEUE_INFO_HW_ID,
 } hsa_queue_info_attribute_t;

 hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute,
@@ -3760,6 +3772,81 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
                                           uint64_t size, int64_t file_offset,
                                           uint64_t *size_copied, int32_t *status);

+/**
+ * @brief Create a queue that is limited by the GPU_MAX_HW_QUEUES environment variable.
+ *
+ * This underlying hardware queue returned by this function may be shared by other queues. For
+ * each priority this function will create a new hardware queue as long as the number of queues
+ * is less than GPU_MAX_HW_QUEUES. Once the GPU_MAX_HW_QUEUES limit is reached, this function will
+ * stop creating new hardware queues and return a reference to an existing queue of the requested
+ * @p priority instead. Each successful call will return a different @p queue handle.
+ * The hsa_amd_queue_get_info API can be used to determine whether this queue is currently shared.
+ *
+ * When there are multiple eligible hardware queues available, the queue with the lowest
+ * HSA_QUEUE_INFO_USE_COUNT will be returned.
+ *
+ * For each successful call, hsa_amd_counted_queue_release should be called to release the
+ * HSA_QUEUE_INFO_USE_COUNT. After release, the queue handle becomes invalid and must not be used.
+ * 
+ * hsa_amd_queue_set_priority and hsa_amd_queue_cu_set_mask cannot be used on counted queues.
+ *
+ * @param[in] agent Agent where to create the queue
+ *
+ * @param[in] type  For future use. HSA_QUEUE_TYPE_MULTI is the only valid option.
+ *
+ * @param[in] priority Associated priority. The GPU_MAX_HW_QUEUES limit is counted for each priority
+ *
+ * @param[in] callback callback invoked by the HSA runtime for every asynchronous event related to
+ * the newly created queue. May be NULL. The HSA runtime passes three arguments to the callback: a
+ * code identifying the event that triggered the invocation, a pointer to the queue where the event
+ * originated, and the application data.
+ *
+ * @param[in] data Application data that is passed to @p callback on every iteration. May be NULL.
+ *
+ * @param[in] flags For future use. Ignored.
+ *
+ * @param[out] queue Unique handle to reference the newly created queue.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate the resources required
+ * by the implementation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid or not a GPU agent.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p type is not HSA_QUEUE_TYPE_MULTI.
+ * 
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid priority or NULL queue pointer.
+ */
+hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type,
+                                           hsa_amd_queue_priority_t priority,
+                                           void (*callback)(hsa_status_t status,
+                                                            hsa_queue_t* source, void* data),
+                                           void* data, uint64_t flags, hsa_queue_t** queue);
+
+/**
+ * @brief Release a counted queue and decrements its use count.
+ * 
+ * Releases a queue that was previously acquired using hsa_amd_counted_queue_acquire.
+ * Each call to this API decrements the internal use count HSA_QUEUE_INFO_USE_COUNT 
+ * of the underlying hardware. After this call, queue handle is invalid and must not be used.
+ * Once created, the hardware queue is retained until hsa_shutdown is called to avoid costly
+ * overhead of repeatedly creating new hardware queues, allowing them to be reused. 
+ * 
+ *
+ * @param[in] queue Counted queue handle returned from hsa_amd_counted_queue_acquire.
+ * Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * @retval ::HSA_STATUS_ERROR Invalid queue or queue was already released.
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The @p queue is NULL or previously released.
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The queue's agent is invalid or not a GPU agent.
+ */
+hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue);
+
 /**
 * @brief logging types
 */