diff --git a/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc
index da65940777..060b82cb74 100644
--- a/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc
+++ b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc
@@ -718,6 +718,7 @@ void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() {
 
   // To verify that after the queue has been used up, next index wraps around
   std::atomic<uint64_t> maxIndexSeen{0};
+  std::atomic<uint32_t> countedQueueSize{0};
 
   auto func = [&]() {
     // local dest buffer for each user application
@@ -746,6 +747,8 @@ void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() {
     uint32_t queue_size = queue->size;           // should be 16384
     const uint32_t queue_mask = queue_size - 1;  // used for index wraparound
 
+    countedQueueSize.store(queue_size);
+
     struct __attribute__((aligned(16))) local_args_t {
       uint32_t* dstArray;
       uint32_t* srcArray;
@@ -847,9 +850,9 @@ void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() {
     th.join();
   }
 
-  // Verify value of max seen index
+  // Verify the max seen index against the size reported by the counted queue
   uint64_t maxId = maxIndexSeen.load();
-  EXPECT_EQ(maxId, (16384 + 5) * kThreads - 1);
+  EXPECT_EQ(maxId, (countedQueueSize.load() + 5) * kThreads - 1);
 
   hsa_amd_memory_pool_free(shared_src_buffer);
 }
\ No newline at end of file
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h
index e7b05dc318..5d12866b55 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h
@@ -53,6 +53,7 @@ class CountedQueuePoolManager {
   
   core::Agent* agent_; // pointer to the gpu agent that owns this pool
   uint32_t max_hw_queues_;
+  size_t counted_queue_size_; // size passed to QueueCreate for each new hw queue
   std::mutex mutex_;
 
   // Pool of hw queues by priority on the agent
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp
index 675129e776..4b4edfa21f 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp
@@ -11,11 +11,10 @@
 namespace rocr {
 namespace core {
 
-constexpr size_t DEFAULT_QUEUE_SIZE = 16384;
-
 CountedQueuePoolManager::CountedQueuePoolManager(core::Agent* agent) : agent_(agent) {
-  // Read in GPU_MAX_HW_QUEUES flag value
+  // Cache the GPU_MAX_HW_QUEUES and HSA_COUNTED_QUEUE_SIZE settings from the runtime flags
   max_hw_queues_ = core::Runtime::runtime_singleton_->flag().cp_queues_limit();
+  counted_queue_size_ = core::Runtime::runtime_singleton_->flag().counted_queue_size();
 }
 
 hsa_status_t CountedQueuePoolManager::AcquireQueue(
@@ -78,7 +77,7 @@ core::Queue* CountedQueuePoolManager::FindOrCreateHardwareQueue(
   // Create a new hardware queue
   core::Queue* cmd_queue = nullptr;
   hsa_status_t status =
-      agent_->QueueCreate(DEFAULT_QUEUE_SIZE, type, 0, callback, data, 0, 0, &cmd_queue);
+      agent_->QueueCreate(counted_queue_size_, type, 0, callback, data, 0, 0, &cmd_queue);
   if (status != HSA_STATUS_SUCCESS) return nullptr;
 
   status = cmd_queue->SetPriority(priority);
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
index 4a283b9e12..126c7c52a9 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h
@@ -54,6 +54,9 @@
 
 namespace rocr {
 
+constexpr size_t DEFAULT_COUNTED_QUEUE_SIZE = 16384;  // used when HSA_COUNTED_QUEUE_SIZE is unset
+constexpr uint32_t DEFAULT_GPU_HW_QUEUES_MAX = 4;  // used when GPU_MAX_HW_QUEUES is unset
+
 class Flag {
  public:
   enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT };
@@ -306,8 +309,16 @@ class Flag {
     core_dump_disable_ = (var == "1");
 
     core_dump_pattern_ = os::GetEnvVar("HSA_COREDUMP_PATTERN");
+
+    // This limits the maximum number of hardware queues that can be created per
+    // priority level for counted queues on every GPU agent. By default, the limit is set to 4.
     var = os::GetEnvVar("GPU_MAX_HW_QUEUES");
-    cp_queues_limit_ = var.empty() ? 4 : atoi(var.c_str());
+    cp_queues_limit_ = var.empty() ? DEFAULT_GPU_HW_QUEUES_MAX : atoi(var.c_str());
+
+    // Sets the size of counted queues created through the hsa_amd_counted_queue_acquire API.
+    // If not set, the default is 16384. NOTE(review): the atoi result is not range-checked here.
+    var = os::GetEnvVar("HSA_COUNTED_QUEUE_SIZE");
+    counted_queue_size_ = var.empty() ? DEFAULT_COUNTED_QUEUE_SIZE : atoi(var.c_str());
   }
 
   void parse_masks(uint32_t maxGpu, uint32_t maxCU) {
@@ -430,6 +441,8 @@ class Flag {
 
   uint32_t cp_queues_limit() const { return cp_queues_limit_; }
 
+  size_t counted_queue_size() const { return counted_queue_size_; }
+
   bool dev_mem_queue_buf() const { return dev_mem_queue_buf_; }
 
   uint32_t signal_abort_timeout() const { return signal_abort_timeout_; }
@@ -550,6 +563,7 @@ class Flag {
   std::string core_dump_pattern_;
 
   uint32_t cp_queues_limit_;
+  size_t counted_queue_size_;
 
   // Map GPU index post RVD to its default cu mask.
   std::map<uint32_t, std::vector<uint32_t>> cu_mask_;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h
index bcf9a77021..9ba159686f 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h
@@ -3793,6 +3793,7 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
  * @param[in] agent Agent where to create the queue
  *
  * @param[in] type  For future use. HSA_QUEUE_TYPE_MULTI is the only valid option.
+ * HSA_QUEUE_TYPE_COOPERATIVE queues are not supported.
  *
  * @param[in] priority Associated priority. The GPU_MAX_HW_QUEUES limit is counted for each priority
  *