diff --git a/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc index da65940777..060b82cb74 100644 --- a/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc +++ b/projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc @@ -718,6 +718,7 @@ void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() { // To verify that after the queue has been used up, next index wraps around std::atomic maxIndexSeen{0}; + std::atomic countedQueueSize{0}; auto func = [&]() { // local dest buffer for each user application @@ -746,6 +747,8 @@ void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() { uint32_t queue_size = queue->size; // should be 16384 const uint32_t queue_mask = queue_size - 1; // used for index wraparound + countedQueueSize.store(queue_size); + struct __attribute__((aligned(16))) local_args_t { uint32_t* dstArray; uint32_t* srcArray; @@ -847,9 +850,9 @@ void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() { th.join(); } - // Verify value of max seen index + // Verify value of max seen index based on counted queue size uint64_t maxId = maxIndexSeen.load(); - EXPECT_EQ(maxId, (16384 + 5) * kThreads - 1); + EXPECT_EQ(maxId, (countedQueueSize.load() + 5) * kThreads - 1); hsa_amd_memory_pool_free(shared_src_buffer); } \ No newline at end of file diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h index e7b05dc318..5d12866b55 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/counted_queue_manager.h @@ -53,6 +53,7 @@ class CountedQueuePoolManager { core::Agent* agent_; // pointer to the gpu agent that owns this pool uint32_t max_hw_queues_; + size_t counted_queue_size_; std::mutex mutex_; // Pool of hw queues by priority on the agent diff --git 
a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp index 675129e776..4b4edfa21f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/counted_queue_manager.cpp @@ -11,11 +11,10 @@ namespace rocr { namespace core { -constexpr size_t DEFAULT_QUEUE_SIZE = 16384; - CountedQueuePoolManager::CountedQueuePoolManager(core::Agent* agent) : agent_(agent) { - // Read in GPU_MAX_HW_QUEUES flag value + // Read in GPU_MAX_HW_QUEUES and HSA_COUNTED_QUEUE_SIZE flags max_hw_queues_ = core::Runtime::runtime_singleton_->flag().cp_queues_limit(); + counted_queue_size_ = core::Runtime::runtime_singleton_->flag().counted_queue_size(); } hsa_status_t CountedQueuePoolManager::AcquireQueue( @@ -78,7 +77,7 @@ core::Queue* CountedQueuePoolManager::FindOrCreateHardwareQueue( // Create a new hardware queue core::Queue* cmd_queue = nullptr; hsa_status_t status = - agent_->QueueCreate(DEFAULT_QUEUE_SIZE, type, 0, callback, data, 0, 0, &cmd_queue); + agent_->QueueCreate(counted_queue_size_, type, 0, callback, data, 0, 0, &cmd_queue); if (status != HSA_STATUS_SUCCESS) return nullptr; status = cmd_queue->SetPriority(priority); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h index 4a283b9e12..126c7c52a9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/flag.h @@ -54,6 +54,9 @@ namespace rocr { +constexpr size_t DEFAULT_COUNTED_QUEUE_SIZE = 16384; +constexpr uint32_t DEFAULT_GPU_HW_QUEUES_MAX = 4; + class Flag { public: enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT }; @@ -306,8 +309,16 @@ class Flag { core_dump_disable_ = (var == "1"); core_dump_pattern_ = os::GetEnvVar("HSA_COREDUMP_PATTERN"); + + // This 
limits the maximum number of hardware queues that can be created per + // priority level for counted queues on every GPU agent. By default, the limit is set to 4. var = os::GetEnvVar("GPU_MAX_HW_QUEUES"); - cp_queues_limit_ = var.empty() ? 4 : atoi(var.c_str()); + cp_queues_limit_ = var.empty() ? DEFAULT_GPU_HW_QUEUES_MAX : atoi(var.c_str()); + + // This allows configuring the size of counted queues created through the + // hsa_amd_counted_queue_acquire API. If not set, the default queue size is 16384. NOTE(review): the value is parsed with atoi() and is not validated here. + var = os::GetEnvVar("HSA_COUNTED_QUEUE_SIZE"); + counted_queue_size_ = var.empty() ? DEFAULT_COUNTED_QUEUE_SIZE : atoi(var.c_str()); } void parse_masks(uint32_t maxGpu, uint32_t maxCU) { @@ -430,6 +441,8 @@ class Flag { uint32_t cp_queues_limit() const { return cp_queues_limit_; } + size_t counted_queue_size() const { return counted_queue_size_; } + bool dev_mem_queue_buf() const { return dev_mem_queue_buf_; } uint32_t signal_abort_timeout() const { return signal_abort_timeout_; } @@ -550,6 +563,7 @@ class Flag { std::string core_dump_pattern_; uint32_t cp_queues_limit_; + size_t counted_queue_size_; // Map GPU index post RVD to its default cu mask. std::map> cu_mask_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h index bcf9a77021..9ba159686f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -3793,6 +3793,7 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi * @param[in] agent Agent where to create the queue * * @param[in] type For future use. HSA_QUEUE_TYPE_MULTI is the only valid option. + * HSA_QUEUE_TYPE_COOPERATIVE queues are not supported. * * @param[in] priority Associated priority. The GPU_MAX_HW_QUEUES limit is counted for each priority *