SWDEV-561708 Initial shared queue pool apis (#1614)
* SWDEV-561708 Initial shared queue pool apis * Validate params; some fixes in callback function (but still needs to be checked) * Dtor cleanup * minor * Enable profiling; remove callback since aql_queue takes care of it * setPriority and setCuMask APIs updated for counted queues * Increasing step and minor version for rocprofiler * Tests for CountedQueueManager * tests * Code refactored to make pool manager part of GpuAgent only (incomplete); unique handles issue pending * Refactored code to support CQM inside GpuAgent and unique handles; multithreaded test added * Changed to ASSERT_SUCCESS macros for all tests * RIng buffer overflow test added * tests fixed; cleanup added at hsa_shutdown * priority conversion table changes * Compiler warnings fixed * Rewrite 1 test; add desc and improve SetUp() code * Improvement * Unififed getinfo for both counted and non-counted queues * Address PR feedback * Addressing feedback: memleak, data type mismatch, documentation * improve comment * format * Missing HSA_API macros for roctracer * Revert "Addressing feedback: memleak, data type mismatch, documentation" This reverts commit 5e498a55fb3640e00d06cec63dcec79293fb23de. * Improving acquire api doc * release api doc improved * error codes for release api doc
This commit is contained in:
@@ -0,0 +1,855 @@
|
||||
/*
|
||||
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
|
||||
#include "suites/functional/counted_queues.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "common/os.h"
|
||||
|
||||
|
||||
static bool VerifyResult(uint32_t* ar, size_t sz) {
|
||||
for (size_t i = 0; i < sz; ++i) {
|
||||
if (i * i != ar[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
CountedQueuesTest::CountedQueuesTest() : TestBase() {
|
||||
set_title("RocR Counted Queues Test");
|
||||
set_description(
|
||||
"This test validates the behavior of Shared Counted Queues managed by the "
|
||||
"Counted Queue Manager in a scenario where different libraries use CP "
|
||||
"Queues and it avoids oversubscription and a subsequent performance degradation.");
|
||||
}
|
||||
|
||||
CountedQueuesTest::~CountedQueuesTest() {}
|
||||
|
||||
void CountedQueuesTest::SetUp() {
|
||||
const std::string kDefaultLimit = "2";
|
||||
static const std::unordered_map<std::string, std::string> kQueueLimits = {
|
||||
{"Counted_Queue_Multithreaded_Dispatch_Test", "1"},
|
||||
{"Counted_Queue_Overflow_And_Wraparound_Test", "1"},
|
||||
{"Counted_Queue_Same_Priority_Max_Limit_Test", "4"}};
|
||||
|
||||
const ::testing::TestInfo* test_info = ::testing::UnitTest::GetInstance()->current_test_info();
|
||||
if (test_info) {
|
||||
const std::string test_name = test_info->name();
|
||||
|
||||
// Find the current test's required limit from map and set the env var
|
||||
// Set default HW queue limit if not found in map
|
||||
auto it = kQueueLimits.find(test_name);
|
||||
const std::string& limit = (it != kQueueLimits.end()) ? it->second : kDefaultLimit;
|
||||
rocrtst::SetEnv("GPU_MAX_HW_QUEUES", limit.c_str());
|
||||
}
|
||||
|
||||
TestBase::SetUp();
|
||||
}
|
||||
|
||||
void CountedQueuesTest::Run() {
|
||||
// Compare required profile for this test case with what we're actually
|
||||
// running on
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
TestBase::Run();
|
||||
}
|
||||
|
||||
void CountedQueuesTest::Close() {
|
||||
// This will close handles opened within rocrtst utility calls and call
|
||||
// hsa_shut_down(), so it should be done after other hsa cleanup
|
||||
TestBase::Close();
|
||||
}
|
||||
|
||||
void CountedQueuesTest::DisplayResults() const {
|
||||
// Compare required profile for this test case with what we're actually
|
||||
// running on
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
TestBase::DisplayResults();
|
||||
}
|
||||
|
||||
void CountedQueuesTest::DisplayTestInfo() { TestBase::DisplayTestInfo(); }
|
||||
|
||||
void CountedQueuesTest::CountedQueueBasicApiTest() {
|
||||
// Find all gpu agents
|
||||
std::vector<hsa_agent_t> gpus;
|
||||
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
|
||||
|
||||
hsa_queue_t* queue = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &queue));
|
||||
ASSERT_NE(queue, nullptr);
|
||||
|
||||
// Query counted queue and check internal reference count
|
||||
int32_t use_count = 0;
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
|
||||
EXPECT_EQ(use_count, 1); // should be 1 after acquire
|
||||
|
||||
// Release the queue
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
|
||||
|
||||
// Check that ref count is back to 0 after release
|
||||
hsa_status_t status;
|
||||
status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count);
|
||||
ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
|
||||
}
|
||||
|
||||
void CountedQueuesTest::CountedQueues_SamePriority_MaxLimitTest() {
|
||||
hsa_status_t status;
|
||||
|
||||
// Find all GPU agents
|
||||
std::vector<hsa_agent_t> gpus;
|
||||
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
|
||||
ASSERT_FALSE(gpus.empty());
|
||||
|
||||
const int NUM_QUEUES = 50;
|
||||
const int MAX_HW_QUEUES = std::stoi(rocrtst::GetEnv("GPU_MAX_HW_QUEUES"));
|
||||
|
||||
std::vector<hsa_queue_t*> queues(NUM_QUEUES, nullptr);
|
||||
std::vector<uint32_t> hw_ids(NUM_QUEUES, 0);
|
||||
|
||||
// Acquire NUM_QUEUES counted queues
|
||||
for (int i = 0; i < NUM_QUEUES; i++) {
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
|
||||
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
|
||||
&queues[i]));
|
||||
ASSERT_NE(queues[i], nullptr);
|
||||
}
|
||||
|
||||
// Query HW IDs
|
||||
for (int i = 0; i < NUM_QUEUES; i++) {
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(queues[i], HSA_QUEUE_INFO_HW_ID, &hw_ids[i]));
|
||||
}
|
||||
|
||||
// Sort and remove duplicate HW IDs
|
||||
std::sort(hw_ids.begin(), hw_ids.end());
|
||||
auto it = std::unique(hw_ids.begin(), hw_ids.end());
|
||||
hw_ids.resize(std::distance(hw_ids.begin(), it));
|
||||
|
||||
// Ensure hardware queue count matches MAX_HW_QUEUES
|
||||
ASSERT_EQ(hw_ids.size(), MAX_HW_QUEUES);
|
||||
|
||||
// Verify even distribution of logical queues over HW queues
|
||||
// Map HW ID -> use count
|
||||
std::unordered_map<uint32_t, uint32_t> use_counts;
|
||||
|
||||
for (auto* q : queues) {
|
||||
uint32_t hwid = 0, count = 0;
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_HW_ID, &hwid));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &count));
|
||||
use_counts[hwid] = count; // overwrites but counts are per-hw, same across queues
|
||||
}
|
||||
|
||||
// Gather all use-counts for fairness check
|
||||
std::vector<uint32_t> dist;
|
||||
for (auto& kv : use_counts) {
|
||||
dist.push_back(kv.second);
|
||||
}
|
||||
|
||||
ASSERT_EQ(dist.size(), MAX_HW_QUEUES);
|
||||
|
||||
// Fair distribution: difference should not exceed 1
|
||||
auto [min_it, max_it] = std::minmax_element(dist.begin(), dist.end());
|
||||
EXPECT_LE(*max_it - *min_it, 1);
|
||||
|
||||
// Release queues
|
||||
for (auto* q : queues) {
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(q));
|
||||
}
|
||||
|
||||
// After release, querying use-count should return invalid argument
|
||||
for (auto* q : queues) {
|
||||
uint32_t tmp = 0;
|
||||
EXPECT_EQ(hsa_amd_queue_get_info(q, HSA_QUEUE_INFO_USE_COUNT, &tmp),
|
||||
HSA_STATUS_ERROR_INVALID_ARGUMENT);
|
||||
}
|
||||
}
|
||||
|
||||
void CountedQueuesTest::InvalidArgsTest() {
|
||||
hsa_status_t status;
|
||||
hsa_queue_t* q = nullptr;
|
||||
|
||||
// Find all gpu agents
|
||||
std::vector<hsa_agent_t> gpus;
|
||||
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
|
||||
|
||||
// Invalid queue pointer
|
||||
status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW,
|
||||
nullptr, nullptr, 0, nullptr);
|
||||
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
|
||||
|
||||
// Invalid priority
|
||||
const hsa_amd_queue_priority_t invalid_priority = static_cast<hsa_amd_queue_priority_t>(999);
|
||||
status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI, invalid_priority, nullptr,
|
||||
nullptr, 0, &q);
|
||||
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
|
||||
|
||||
// Support multi producer queues only
|
||||
status = hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_SINGLE, HSA_AMD_QUEUE_PRIORITY_LOW,
|
||||
nullptr, nullptr, 0, &q);
|
||||
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE_CREATION);
|
||||
|
||||
// Check release API params
|
||||
hsa_queue_t* queue = nullptr;
|
||||
status = hsa_amd_counted_queue_release(queue);
|
||||
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
|
||||
}
|
||||
|
||||
void CountedQueuesTest::CountedQueuesAllPrioritiesLimitTest() {
|
||||
hsa_status_t status;
|
||||
|
||||
// Find all gpu agents
|
||||
std::vector<hsa_agent_t> gpus;
|
||||
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
|
||||
|
||||
// Acquire 2 queues per priority (total 6 queues)
|
||||
hsa_queue_t *low1 = nullptr, *low2 = nullptr, *low3 = nullptr;
|
||||
hsa_queue_t *normal1 = nullptr, *normal2 = nullptr, *normal3 = nullptr;
|
||||
hsa_queue_t *high1 = nullptr, *high2 = nullptr, *high3 = nullptr;
|
||||
|
||||
// Low Priority
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low1));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &low2));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
|
||||
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
|
||||
&low3)); // should reuse low1
|
||||
|
||||
// Normal Priority
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal1));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0, &normal2));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(gpus[0], HSA_QUEUE_TYPE_MULTI,
|
||||
HSA_AMD_QUEUE_PRIORITY_NORMAL, nullptr, nullptr, 0,
|
||||
&normal3)); // should reuse normal1
|
||||
|
||||
// High Priority
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high1));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high2));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_HIGH, nullptr, nullptr, 0, &high3));
|
||||
|
||||
// Verify reuse and independence per priority
|
||||
uint32_t low_id1 = 0, low_id2 = 0, low_id3 = 0;
|
||||
uint32_t norm_id1 = 0, norm_id2 = 0, norm_id3 = 0;
|
||||
uint32_t high_id1 = 0, high_id2 = 0, high_id3 = 0;
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_HW_ID, &low_id1));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_HW_ID, &low_id2));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_HW_ID, &low_id3));
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_HW_ID, &norm_id1));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_HW_ID, &norm_id2));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_HW_ID, &norm_id3));
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_HW_ID, &high_id1));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_HW_ID, &high_id2));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_HW_ID, &high_id3));
|
||||
|
||||
// Within same priority: max 2 unique HW queues
|
||||
EXPECT_NE(low_id1, low_id2);
|
||||
EXPECT_TRUE(low_id3 == low_id1);
|
||||
|
||||
EXPECT_NE(norm_id1, norm_id2);
|
||||
EXPECT_TRUE(norm_id3 == norm_id1);
|
||||
|
||||
EXPECT_NE(high_id1, high_id2);
|
||||
EXPECT_TRUE(high_id3 == high_id1);
|
||||
|
||||
// Ensure different queues are used across priorities
|
||||
EXPECT_NE(low_id1, norm_id1);
|
||||
EXPECT_NE(norm_id1, high_id1);
|
||||
EXPECT_NE(low_id1, high_id1);
|
||||
|
||||
// Verify use counts of first two HW queues
|
||||
uint32_t low_use1 = 0, low_use2 = 0, low_use3 = 0;
|
||||
uint32_t norm_use1 = 0, norm_use2 = 0, norm_use3 = 0;
|
||||
uint32_t high_use1 = 0, high_use2 = 0, high_use3 = 0;
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(low1, HSA_QUEUE_INFO_USE_COUNT, &low_use1));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(low2, HSA_QUEUE_INFO_USE_COUNT, &low_use2));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(low3, HSA_QUEUE_INFO_USE_COUNT, &low_use3));
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal1, HSA_QUEUE_INFO_USE_COUNT, &norm_use1));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal2, HSA_QUEUE_INFO_USE_COUNT, &norm_use2));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(normal3, HSA_QUEUE_INFO_USE_COUNT, &norm_use3));
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(high1, HSA_QUEUE_INFO_USE_COUNT, &high_use1));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(high2, HSA_QUEUE_INFO_USE_COUNT, &high_use2));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(high3, HSA_QUEUE_INFO_USE_COUNT, &high_use3));
|
||||
|
||||
EXPECT_EQ(low_use1, 2);
|
||||
EXPECT_EQ(low_use2, 1);
|
||||
EXPECT_TRUE(low_use1 == low_use3); // same HW queues, same ref count
|
||||
|
||||
EXPECT_EQ(norm_use1, 2);
|
||||
EXPECT_EQ(norm_use2, 1);
|
||||
EXPECT_TRUE(norm_use1 == norm_use3);
|
||||
|
||||
EXPECT_EQ(high_use1, 2);
|
||||
EXPECT_EQ(high_use2, 1);
|
||||
EXPECT_TRUE(high_use1 == high_use3);
|
||||
|
||||
// Release all queues
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(low1));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(low2));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(low3));
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal1));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal2));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(normal3));
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(high1));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(high2));
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(high3));
|
||||
}
|
||||
|
||||
void CountedQueuesTest::CountedQueuesSetPriorityNackTest() {
|
||||
hsa_status_t status;
|
||||
|
||||
// Find all gpu agents
|
||||
std::vector<hsa_agent_t> gpus;
|
||||
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
|
||||
|
||||
// Create a counted queue
|
||||
hsa_queue_t* queue = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue));
|
||||
EXPECT_NE(queue, nullptr);
|
||||
|
||||
// Try to set priority on this queue; should fail
|
||||
status = hsa_amd_queue_set_priority(queue, HSA_AMD_QUEUE_PRIORITY_HIGH);
|
||||
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE);
|
||||
|
||||
// release queue
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
|
||||
}
|
||||
|
||||
void CountedQueuesTest::CountedQueuesSetCUMaskNackTest() {
|
||||
hsa_status_t status;
|
||||
|
||||
// Find all gpu agents
|
||||
std::vector<hsa_agent_t> gpus;
|
||||
ASSERT_SUCCESS(hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus));
|
||||
|
||||
// Create a counted queue
|
||||
hsa_queue_t* queue = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(
|
||||
gpus[0], HSA_QUEUE_TYPE_MULTI, HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0, &queue));
|
||||
EXPECT_NE(queue, nullptr);
|
||||
|
||||
// Attempt to set CU mask on counted queue; should fail
|
||||
uint32_t cu_mask[32] = {0}; // dummy mask
|
||||
status = hsa_amd_queue_cu_set_mask(queue, 1, cu_mask);
|
||||
EXPECT_EQ(status, HSA_STATUS_ERROR_INVALID_QUEUE);
|
||||
|
||||
// release queue
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
|
||||
}
|
||||
|
||||
void CountedQueuesTest::CountedQueuesDispatchTest() {
|
||||
hsa_status_t status;
|
||||
|
||||
// Common setup
|
||||
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
|
||||
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
|
||||
|
||||
// Load kernel
|
||||
set_kernel_file_name("test_case_template_kernels.hsaco");
|
||||
set_kernel_name("square");
|
||||
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
|
||||
|
||||
hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
|
||||
|
||||
// Allocate source buffer
|
||||
void* src_buffer = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &src_buffer));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, src_buffer));
|
||||
|
||||
// Initialize source data
|
||||
for (uint32_t i = 0; i < 256; ++i) {
|
||||
reinterpret_cast<uint32_t*>(src_buffer)[i] = i;
|
||||
}
|
||||
|
||||
// Allocate destination buffer
|
||||
void* dst_buffer = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &dst_buffer));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, dst_buffer));
|
||||
|
||||
// Create completion signal
|
||||
hsa_signal_t completion_signal;
|
||||
ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &completion_signal));
|
||||
|
||||
// Get a counted queue
|
||||
hsa_queue_t* queue = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
|
||||
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
|
||||
&queue));
|
||||
EXPECT_NE(queue, nullptr);
|
||||
|
||||
// Query queue info
|
||||
int32_t use_count = 0;
|
||||
uint32_t hw_id = 0;
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &hw_id));
|
||||
EXPECT_EQ(use_count, 1);
|
||||
|
||||
// Prepare kernel arguments
|
||||
struct __attribute__((aligned(16))) local_args_t {
|
||||
uint32_t* dstArray;
|
||||
uint32_t* srcArray;
|
||||
uint32_t size;
|
||||
uint32_t pad;
|
||||
uint64_t global_offset_x;
|
||||
uint64_t global_offset_y;
|
||||
uint64_t global_offset_z;
|
||||
uint64_t printf_buffer;
|
||||
uint64_t default_queue;
|
||||
uint64_t completion_action;
|
||||
} local_args;
|
||||
|
||||
local_args.dstArray = reinterpret_cast<uint32_t*>(dst_buffer);
|
||||
local_args.srcArray = reinterpret_cast<uint32_t*>(src_buffer);
|
||||
local_args.size = 256;
|
||||
local_args.global_offset_x = 0;
|
||||
local_args.global_offset_y = 0;
|
||||
local_args.global_offset_z = 0;
|
||||
local_args.printf_buffer = 0;
|
||||
local_args.default_queue = 0;
|
||||
local_args.completion_action = 0;
|
||||
|
||||
// Allocate kernel arguments
|
||||
void* kernarg_address = nullptr;
|
||||
ASSERT_SUCCESS(
|
||||
hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
|
||||
memcpy(kernarg_address, &local_args, sizeof(local_args));
|
||||
|
||||
// Dispatch loop
|
||||
int it = num_iteration() * 5;
|
||||
const uint32_t queue_mask = queue->size - 1;
|
||||
|
||||
for (int i = 0; i < it; i++) {
|
||||
// Reserve a slot in the queue
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
|
||||
|
||||
// Get pointer to the reserved packet slot
|
||||
hsa_kernel_dispatch_packet_t* queue_aql_packet =
|
||||
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[index & queue_mask];
|
||||
|
||||
// Fill packet fields
|
||||
queue_aql_packet->setup = 1;
|
||||
queue_aql_packet->workgroup_size_x = 256;
|
||||
queue_aql_packet->workgroup_size_y = 1;
|
||||
queue_aql_packet->workgroup_size_z = 1;
|
||||
queue_aql_packet->grid_size_x = 256;
|
||||
queue_aql_packet->grid_size_y = 1;
|
||||
queue_aql_packet->grid_size_z = 1;
|
||||
queue_aql_packet->private_segment_size = 0;
|
||||
queue_aql_packet->group_segment_size = 0;
|
||||
queue_aql_packet->kernel_object = kernel_object();
|
||||
queue_aql_packet->kernarg_address = kernarg_address;
|
||||
queue_aql_packet->completion_signal = completion_signal;
|
||||
|
||||
// Write header for packet
|
||||
uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
__atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
|
||||
__ATOMIC_RELEASE);
|
||||
|
||||
// Ring doorbell to notify GPU
|
||||
hsa_signal_store_screlease(queue->doorbell_signal, index);
|
||||
|
||||
// Wait for completion signal
|
||||
while (hsa_signal_wait_scacquire(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
|
||||
HSA_WAIT_STATE_ACTIVE)) {
|
||||
}
|
||||
|
||||
// Reset signal for next iteration
|
||||
hsa_signal_store_screlease(completion_signal, 1);
|
||||
|
||||
// Verify results
|
||||
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(dst_buffer), 256));
|
||||
}
|
||||
|
||||
// Verify use count before release
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count));
|
||||
EXPECT_EQ(use_count, 1);
|
||||
|
||||
// Release the counted queue
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
|
||||
|
||||
// Verify queue info returns error after release
|
||||
status = hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &use_count);
|
||||
ASSERT_EQ(status, HSA_STATUS_ERROR_INVALID_ARGUMENT);
|
||||
|
||||
// Cleanup
|
||||
ASSERT_SUCCESS(hsa_amd_memory_pool_free(kernarg_address));
|
||||
ASSERT_SUCCESS(hsa_signal_destroy(completion_signal));
|
||||
ASSERT_SUCCESS(hsa_amd_memory_pool_free(src_buffer));
|
||||
ASSERT_SUCCESS(hsa_amd_memory_pool_free(dst_buffer));
|
||||
}
|
||||
|
||||
void CountedQueuesTest::CountedQueuesMultithreadedDispatchTest() {
|
||||
hsa_status_t status;
|
||||
|
||||
// Common setup
|
||||
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
|
||||
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
|
||||
|
||||
// Load kernel
|
||||
set_kernel_file_name("test_case_template_kernels.hsaco");
|
||||
set_kernel_name("square");
|
||||
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
|
||||
|
||||
hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
|
||||
|
||||
// Shared source buffer (read-only)
|
||||
void* shared_src_buffer = nullptr;
|
||||
ASSERT_SUCCESS(
|
||||
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer));
|
||||
|
||||
// Initialize source data
|
||||
for (uint32_t i = 0; i < 256; ++i) {
|
||||
reinterpret_cast<uint32_t*>(shared_src_buffer)[i] = i;
|
||||
}
|
||||
|
||||
// Structures for validation later on
|
||||
std::mutex hwIdsMutex;
|
||||
std::vector<uint32_t> allHwIds;
|
||||
std::atomic<int32_t> maxUseCount{0};
|
||||
|
||||
auto func = [&]() {
|
||||
// local dest buffer for each user application
|
||||
void* local_dst_buffer = nullptr;
|
||||
ASSERT_SUCCESS(
|
||||
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer));
|
||||
|
||||
// Local completion signal for every user application
|
||||
hsa_signal_t local_signal;
|
||||
ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal));
|
||||
|
||||
// Get a counted queue
|
||||
hsa_queue_t* queue = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
|
||||
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
|
||||
&queue));
|
||||
EXPECT_NE(queue, nullptr);
|
||||
|
||||
if (queue == nullptr) {
|
||||
hsa_signal_destroy(local_signal);
|
||||
hsa_amd_memory_pool_free(local_dst_buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
// Store query results for later analysis
|
||||
int32_t localUseCount = 0;
|
||||
uint32_t localHwId = 0;
|
||||
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_USE_COUNT, &localUseCount));
|
||||
ASSERT_SUCCESS(hsa_amd_queue_get_info(queue, HSA_QUEUE_INFO_HW_ID, &localHwId));
|
||||
|
||||
// Update use_count if it is larger than previous value
|
||||
int expected = maxUseCount.load();
|
||||
while (localUseCount > expected &&
|
||||
!maxUseCount.compare_exchange_weak(expected, localUseCount)) {
|
||||
}
|
||||
|
||||
// Store hw id for validation later on
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(hwIdsMutex);
|
||||
allHwIds.push_back(localHwId);
|
||||
}
|
||||
|
||||
struct __attribute__((aligned(16))) local_args_t {
|
||||
uint32_t* dstArray;
|
||||
uint32_t* srcArray;
|
||||
uint32_t size;
|
||||
uint32_t pad;
|
||||
uint64_t global_offset_x;
|
||||
uint64_t global_offset_y;
|
||||
uint64_t global_offset_z;
|
||||
uint64_t printf_buffer;
|
||||
uint64_t default_queue;
|
||||
uint64_t completion_action;
|
||||
} local_args;
|
||||
|
||||
local_args.dstArray = reinterpret_cast<uint32_t*>(local_dst_buffer);
|
||||
local_args.srcArray = reinterpret_cast<uint32_t*>(shared_src_buffer);
|
||||
local_args.size = 256;
|
||||
local_args.global_offset_x = 0;
|
||||
local_args.global_offset_y = 0;
|
||||
local_args.global_offset_z = 0;
|
||||
local_args.printf_buffer = 0;
|
||||
local_args.default_queue = 0;
|
||||
local_args.completion_action = 0;
|
||||
|
||||
void* kernarg_address = nullptr;
|
||||
ASSERT_SUCCESS(
|
||||
hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
|
||||
|
||||
memcpy(kernarg_address, &local_args, sizeof(local_args));
|
||||
|
||||
// Dispatch loop
|
||||
int it = num_iteration() * 5;
|
||||
const uint32_t queue_mask = queue->size - 1;
|
||||
|
||||
for (int i = 0; i < it; i++) {
|
||||
// Reserve a slot in the queue
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
|
||||
|
||||
// Get pointer to the reserved packet slot and validate address
|
||||
hsa_kernel_dispatch_packet_t* queue_aql_packet = &(
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[index & queue_mask];
|
||||
ASSERT_EQ(queue_aql_packet,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address) + (index & queue_mask));
|
||||
|
||||
// Fill packet fields
|
||||
queue_aql_packet->setup = 1;
|
||||
queue_aql_packet->workgroup_size_x = 256;
|
||||
queue_aql_packet->workgroup_size_y = 1;
|
||||
queue_aql_packet->workgroup_size_z = 1;
|
||||
queue_aql_packet->grid_size_x = 256;
|
||||
queue_aql_packet->grid_size_y = 1;
|
||||
queue_aql_packet->grid_size_z = 1;
|
||||
queue_aql_packet->private_segment_size = 0;
|
||||
queue_aql_packet->group_segment_size = 0;
|
||||
queue_aql_packet->kernel_object = kernel_object();
|
||||
queue_aql_packet->kernarg_address = kernarg_address;
|
||||
queue_aql_packet->completion_signal = local_signal;
|
||||
|
||||
// Write header for packet
|
||||
uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
__atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
|
||||
__ATOMIC_RELEASE);
|
||||
|
||||
// Ring doorbell to notify GPU
|
||||
hsa_signal_store_screlease(queue->doorbell_signal, index);
|
||||
|
||||
// Wait for completion signal to be less than 1
|
||||
while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
|
||||
HSA_WAIT_STATE_ACTIVE)) {
|
||||
}
|
||||
|
||||
// Reset signal for next iteration
|
||||
hsa_signal_store_screlease(local_signal, 1);
|
||||
|
||||
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(local_dst_buffer), 256));
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
hsa_amd_memory_pool_free(kernarg_address);
|
||||
hsa_signal_destroy(local_signal);
|
||||
hsa_amd_memory_pool_free(local_dst_buffer);
|
||||
|
||||
// Release the counted queue
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
|
||||
};
|
||||
|
||||
constexpr int kThreads = 2;
|
||||
std::vector<std::thread> threads;
|
||||
for (int i = 0; i < kThreads; i++) {
|
||||
threads.emplace_back(func);
|
||||
}
|
||||
|
||||
for (auto& th : threads) {
|
||||
th.join();
|
||||
}
|
||||
|
||||
// With GPU_MAX_HW_QUEUES=1, all threads should share the same HW queue
|
||||
// Check if largest useCount is same as the number of user apps accessing queues
|
||||
EXPECT_EQ(maxUseCount.load(), kThreads);
|
||||
|
||||
// All HW IDs should be the same (only 1 HW queue created)
|
||||
EXPECT_EQ(allHwIds.size(), static_cast<size_t>(kThreads));
|
||||
for (size_t i = 1; i < allHwIds.size(); i++) {
|
||||
EXPECT_EQ(allHwIds[i], allHwIds[0]);
|
||||
}
|
||||
|
||||
hsa_amd_memory_pool_free(shared_src_buffer);
|
||||
}
|
||||
|
||||
void CountedQueuesTest::CountedQueuesOverflowWrapAroundTest() {
|
||||
hsa_status_t status;
|
||||
|
||||
// Common setup
|
||||
ASSERT_SUCCESS(rocrtst::SetDefaultAgents(this));
|
||||
ASSERT_SUCCESS(rocrtst::SetPoolsTypical(this));
|
||||
|
||||
// Load kernel
|
||||
set_kernel_file_name("test_case_template_kernels.hsaco");
|
||||
set_kernel_name("square");
|
||||
ASSERT_SUCCESS(rocrtst::LoadKernelFromObjFile(this, gpu_device1()));
|
||||
|
||||
hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
|
||||
|
||||
void* shared_src_buffer = nullptr;
|
||||
ASSERT_SUCCESS(
|
||||
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &shared_src_buffer));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, shared_src_buffer));
|
||||
|
||||
for (uint32_t i = 0; i < 256; ++i) {
|
||||
reinterpret_cast<uint32_t*>(shared_src_buffer)[i] = i;
|
||||
}
|
||||
|
||||
// To verify that after the queue has been used up, next index wraps around
|
||||
std::atomic<uint64_t> maxIndexSeen{0};
|
||||
|
||||
auto func = [&]() {
|
||||
// local dest buffer for each user application
|
||||
void* local_dst_buffer = nullptr;
|
||||
ASSERT_SUCCESS(
|
||||
hsa_amd_memory_pool_allocate(cpu_pool(), 256 * sizeof(uint32_t), 0, &local_dst_buffer));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, local_dst_buffer));
|
||||
|
||||
// Local completion signal for every user application
|
||||
hsa_signal_t local_signal;
|
||||
ASSERT_SUCCESS(hsa_signal_create(1, 0, nullptr, &local_signal));
|
||||
|
||||
// Get a counted queue
|
||||
hsa_queue_t* queue = nullptr;
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_acquire(*gpu_device1(), HSA_QUEUE_TYPE_MULTI,
|
||||
HSA_AMD_QUEUE_PRIORITY_LOW, nullptr, nullptr, 0,
|
||||
&queue));
|
||||
EXPECT_NE(queue, nullptr);
|
||||
|
||||
if (queue == nullptr) {
|
||||
hsa_signal_destroy(local_signal);
|
||||
hsa_amd_memory_pool_free(local_dst_buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t queue_size = queue->size; // should be 16384
|
||||
const uint32_t queue_mask = queue_size - 1; // used for index wraparound
|
||||
|
||||
struct __attribute__((aligned(16))) local_args_t {
|
||||
uint32_t* dstArray;
|
||||
uint32_t* srcArray;
|
||||
uint32_t size;
|
||||
uint32_t pad;
|
||||
uint64_t global_offset_x;
|
||||
uint64_t global_offset_y;
|
||||
uint64_t global_offset_z;
|
||||
uint64_t printf_buffer;
|
||||
uint64_t default_queue;
|
||||
uint64_t completion_action;
|
||||
} local_args;
|
||||
|
||||
local_args.dstArray = reinterpret_cast<uint32_t*>(local_dst_buffer);
|
||||
local_args.srcArray = reinterpret_cast<uint32_t*>(shared_src_buffer);
|
||||
local_args.size = 256;
|
||||
local_args.global_offset_x = 0;
|
||||
local_args.global_offset_y = 0;
|
||||
local_args.global_offset_z = 0;
|
||||
local_args.printf_buffer = 0;
|
||||
local_args.default_queue = 0;
|
||||
local_args.completion_action = 0;
|
||||
|
||||
void* kernarg_address = nullptr;
|
||||
ASSERT_SUCCESS(
|
||||
hsa_amd_memory_pool_allocate(kern_arg_pool(), sizeof(local_args), 0, &kernarg_address));
|
||||
ASSERT_SUCCESS(hsa_amd_agents_allow_access(2, ag_list, NULL, kernarg_address));
|
||||
|
||||
memcpy(kernarg_address, &local_args, sizeof(local_args));
|
||||
|
||||
// Dispatch more packets than queue size to force overflow and ensure that indices wrap around
|
||||
int it = queue_size + 5;
|
||||
|
||||
for (int i = 0; i < it; i++) {
|
||||
// Reserve a slot in the queue
|
||||
uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
|
||||
|
||||
uint64_t curr_max = maxIndexSeen.load();
|
||||
while (index > curr_max && !maxIndexSeen.compare_exchange_weak(curr_max, index)) {
|
||||
}
|
||||
|
||||
// Get pointer to the reserved packet slot using wraparound masking
|
||||
uint64_t wrapped_index = index & queue_mask;
|
||||
hsa_kernel_dispatch_packet_t* queue_aql_packet =
|
||||
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address))[wrapped_index];
|
||||
|
||||
// Fill packet fields
|
||||
queue_aql_packet->setup = 1;
|
||||
queue_aql_packet->workgroup_size_x = 256;
|
||||
queue_aql_packet->workgroup_size_y = 1;
|
||||
queue_aql_packet->workgroup_size_z = 1;
|
||||
queue_aql_packet->grid_size_x = 256;
|
||||
queue_aql_packet->grid_size_y = 1;
|
||||
queue_aql_packet->grid_size_z = 1;
|
||||
queue_aql_packet->private_segment_size = 0;
|
||||
queue_aql_packet->group_segment_size = 0;
|
||||
queue_aql_packet->kernel_object = kernel_object();
|
||||
queue_aql_packet->kernarg_address = kernarg_address;
|
||||
queue_aql_packet->completion_signal = local_signal;
|
||||
|
||||
// Write header for packet
|
||||
uint32_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
__atomic_store_n(reinterpret_cast<uint16_t*>(&queue_aql_packet->header), header,
|
||||
__ATOMIC_RELEASE);
|
||||
|
||||
// Ring doorbell to notify GPU
|
||||
hsa_signal_store_screlease(queue->doorbell_signal, index);
|
||||
|
||||
// Wait for completion signal to be less than 1
|
||||
while (hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
|
||||
HSA_WAIT_STATE_ACTIVE)) {
|
||||
}
|
||||
|
||||
// Reset signal for next iteration
|
||||
hsa_signal_store_screlease(local_signal, 1);
|
||||
|
||||
// Verify results are still correct after wraparound
|
||||
ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t*>(local_dst_buffer), 256));
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
hsa_amd_memory_pool_free(kernarg_address);
|
||||
hsa_signal_destroy(local_signal);
|
||||
hsa_amd_memory_pool_free(local_dst_buffer);
|
||||
|
||||
// Release the counted queue
|
||||
ASSERT_SUCCESS(hsa_amd_counted_queue_release(queue));
|
||||
};
|
||||
|
||||
constexpr int kThreads = 2;
|
||||
std::vector<std::thread> threads;
|
||||
for (int i = 0; i < kThreads; i++) {
|
||||
threads.emplace_back(func);
|
||||
}
|
||||
|
||||
for (auto& th : threads) {
|
||||
th.join();
|
||||
}
|
||||
|
||||
// Verify value of max seen index
|
||||
uint64_t maxId = maxIndexSeen.load();
|
||||
EXPECT_EQ(maxId, (16384 + 5) * kThreads - 1);
|
||||
|
||||
hsa_amd_memory_pool_free(shared_src_buffer);
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#ifndef ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
|
||||
#define ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
|
||||
|
||||
|
||||
#include "suites/test_common/test_base.h"
|
||||
|
||||
class CountedQueuesTest : public TestBase {
|
||||
public:
|
||||
explicit CountedQueuesTest();
|
||||
|
||||
// @Brief: Destructor for test case of CountedQueuesTest
|
||||
virtual ~CountedQueuesTest();
|
||||
|
||||
// @Brief: Setup the environment for measurement
|
||||
virtual void SetUp();
|
||||
|
||||
// @Brief: Core measurement execution
|
||||
virtual void Run();
|
||||
|
||||
// @Brief: Clean up and retrive the resource
|
||||
virtual void Close();
|
||||
|
||||
// @Brief: Display results
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
// @Brief: Display information about what this test does
|
||||
virtual void DisplayTestInfo(void);
|
||||
|
||||
/// @brief Basic API test to acquire, query and release 1 HW queue
|
||||
void CountedQueueBasicApiTest();
|
||||
|
||||
/// @brief This test verifies that when many queues of the same priority are created,
|
||||
// they are evenly distributed across the limited set of hardware queues, reuse those
|
||||
// hardware queues correctly, and report proper use-counts before and after release
|
||||
void CountedQueues_SamePriority_MaxLimitTest();
|
||||
|
||||
// @brief Test to verify HSA status codes for incorrect arguments sent via API
|
||||
void InvalidArgsTest();
|
||||
|
||||
// @brief Test to verify that counted queues across all priorities each reuse only
|
||||
// their own priority-specific hardware queues
|
||||
void CountedQueuesAllPrioritiesLimitTest();
|
||||
|
||||
/// @brief Test to verify hsa_amd_queue_set_priority() does not work on counted queues
|
||||
void CountedQueuesSetPriorityNackTest();
|
||||
|
||||
/// @brief Test to verify hsa_amd_queue_cu_set_mask() does not work on counted queues
|
||||
void CountedQueuesSetCUMaskNackTest();
|
||||
|
||||
/// @brief Test to verify that a counted queue correctly supports kernel dispatches end-to-end
|
||||
void CountedQueuesDispatchTest();
|
||||
|
||||
/// @brief Test to verify kernel dispatches onto shared queues from multiple user apps even when they
|
||||
// all share the same HW queue
|
||||
void CountedQueuesMultithreadedDispatchTest();
|
||||
|
||||
/// @brief Test to verify ring buffer wrap around when more than queue_size number of
|
||||
// AQL packets are enqueued
|
||||
void CountedQueuesOverflowWrapAroundTest();
|
||||
};
|
||||
|
||||
#endif // ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
|
||||
@@ -82,6 +82,7 @@
|
||||
#include "suites/functional/filter_devices.h"
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "common/common.h"
|
||||
#include "suites/functional/counted_queues.h"
|
||||
#include "common/os.h"
|
||||
|
||||
static RocrTstGlobals *sRocrtstGlvalues = nullptr;
|
||||
@@ -507,6 +508,69 @@ TEST(rocrtstFunc, Filter_Devices_Test) {
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Basic_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueueBasicApiTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Same_Priority_Max_Limit_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueues_SamePriority_MaxLimitTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Invalid_Args_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.InvalidArgsTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Multiple_Priorities_Limit_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueuesAllPrioritiesLimitTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Set_Priority_Nack_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueuesSetPriorityNackTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Set_CUMask_Nack_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueuesSetCUMaskNackTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Dispatch_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueuesDispatchTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Multithreaded_Dispatch_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueuesMultithreadedDispatchTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, Counted_Queue_Overflow_And_Wraparound_Test) {
|
||||
CountedQueuesTest cq;
|
||||
RunCustomTestProlog(&cq);
|
||||
cq.CountedQueuesOverflowWrapAroundTest();
|
||||
RunCustomTestEpilog(&cq);
|
||||
}
|
||||
|
||||
TEST(rocrtstNeg, Memory_Negative_Tests) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
MemoryAllocateNegativeTest mt;
|
||||
|
||||
@@ -54,6 +54,5 @@
|
||||
test; \
|
||||
}
|
||||
|
||||
|
||||
#endif // ROCRTST_SUITES_TEST_COMMON_MAIN_H_
|
||||
|
||||
|
||||
@@ -217,6 +217,7 @@ set ( SRCS core/driver/driver.cpp
|
||||
core/runtime/cache.cpp
|
||||
core/runtime/svm_profiler.cpp
|
||||
core/runtime/thunk_loader.cpp
|
||||
core/runtime/counted_queue_manager.cpp
|
||||
core/common/hsa_table_interface.cpp
|
||||
loader/executable.cpp
|
||||
libamdhsacode/amd_elf_image.cpp
|
||||
|
||||
@@ -1345,6 +1345,18 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
|
||||
size_copied, status);
|
||||
}
|
||||
|
||||
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(
|
||||
hsa_agent_t agent, hsa_queue_type_t type, hsa_amd_queue_priority_t priority,
|
||||
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
|
||||
uint64_t flags, hsa_queue_t** queue) {
|
||||
return amdExtTable->hsa_amd_counted_queue_acquire_fn(agent, type, priority, callback, data, flags,
|
||||
queue);
|
||||
}
|
||||
|
||||
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue) {
|
||||
return amdExtTable->hsa_amd_counted_queue_release_fn(queue);
|
||||
}
|
||||
|
||||
// Tools only table interfaces.
|
||||
namespace rocr {
|
||||
|
||||
|
||||
@@ -64,6 +64,28 @@
|
||||
extern r_debug _amdgpu_r_debug;
|
||||
|
||||
namespace rocr {
|
||||
|
||||
/// @brief Mapping between priority type used internally within ROCR to the type used by KFD
|
||||
|
||||
// Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH
|
||||
// HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally
|
||||
// in ROCR
|
||||
__forceinline HSA_QUEUE_PRIORITY HsaInternalToKfdPriority(
|
||||
rocr::HSA::hsa_amd_queue_priority_internal_t priority) {
|
||||
switch (priority) {
|
||||
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_LOW:
|
||||
return HSA_QUEUE_PRIORITY_MINIMUM;
|
||||
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL:
|
||||
return HSA_QUEUE_PRIORITY_NORMAL;
|
||||
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_HIGH:
|
||||
return HSA_QUEUE_PRIORITY_HIGH;
|
||||
case rocr::HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM:
|
||||
return HSA_QUEUE_PRIORITY_MAXIMUM;
|
||||
default:
|
||||
return HSA_QUEUE_PRIORITY_NORMAL;
|
||||
}
|
||||
}
|
||||
|
||||
namespace AMD {
|
||||
|
||||
#if defined(__linux__)
|
||||
@@ -370,10 +392,13 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
|
||||
}
|
||||
|
||||
hsa_status_t KfdDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
|
||||
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const {
|
||||
if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id,
|
||||
// Convert from ROCR internal priority type to KFD type
|
||||
HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority);
|
||||
|
||||
if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, kfd_priority, sdma_engine_id,
|
||||
queue_addr, queue_size_bytes, event, &queue_resource)) !=
|
||||
HSAKMT_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
@@ -389,9 +414,12 @@ hsa_status_t KfdDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
|
||||
}
|
||||
|
||||
hsa_status_t KfdDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_addr,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
|
||||
uint64_t queue_size, HsaEvent* event) const {
|
||||
if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, priority, queue_addr, queue_size,
|
||||
// Convert from ROCR internal priority type to KFD type
|
||||
HSA_QUEUE_PRIORITY kfd_priority = HsaInternalToKfdPriority(priority);
|
||||
|
||||
if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, kfd_priority, queue_addr, queue_size,
|
||||
event)) != HSAKMT_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
+2
-2
@@ -430,7 +430,7 @@ hsa_status_t KfdVirtioDriver::MakeMemoryUnresident(const void* mem) const {
|
||||
}
|
||||
|
||||
hsa_status_t KfdVirtioDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
|
||||
void* queue_addr, uint64_t queue_size_bytes,
|
||||
HsaEvent* event, HsaQueueResource& queue_resource) const {
|
||||
if (vhsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id, queue_addr,
|
||||
@@ -447,7 +447,7 @@ hsa_status_t KfdVirtioDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
|
||||
}
|
||||
|
||||
hsa_status_t KfdVirtioDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_mem,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem,
|
||||
uint64_t queue_size, HsaEvent* event) const {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
@@ -383,7 +383,7 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
|
||||
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const {
|
||||
queue_resource.QueueId = AMDXDNA_INVALID_CTX_HANDLE;
|
||||
@@ -407,7 +407,7 @@ hsa_status_t XdnaDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_addr,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
|
||||
uint64_t queue_size, HsaEvent* event) const {
|
||||
// AIE doesn't support queue updates.
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
|
||||
@@ -73,7 +73,7 @@ class AieAqlQueue : public core::Queue,
|
||||
~AieAqlQueue();
|
||||
|
||||
hsa_status_t Inactivate() override;
|
||||
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
|
||||
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override;
|
||||
void Destroy() override;
|
||||
uint64_t LoadReadIndexRelaxed() override;
|
||||
uint64_t LoadReadIndexAcquire() override;
|
||||
|
||||
@@ -73,7 +73,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
|
||||
hsa_status_t Inactivate() override;
|
||||
|
||||
/// @brief Change the scheduling priority of the queue
|
||||
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
|
||||
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override;
|
||||
|
||||
/// @brief Destroy ref counted queue
|
||||
void Destroy() override;
|
||||
@@ -316,7 +316,7 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
|
||||
bool suspended_;
|
||||
|
||||
// Thunk dispatch and wavefront scheduling priority
|
||||
HSA_QUEUE_PRIORITY priority_;
|
||||
HSA::hsa_amd_queue_priority_internal_t priority_;
|
||||
|
||||
// Exception notification signal
|
||||
Signal* exception_signal_;
|
||||
|
||||
@@ -62,6 +62,7 @@
|
||||
#include "core/util/locks.h"
|
||||
#include "core/util/small_heap.h"
|
||||
#include "pcs/pcs_runtime.h"
|
||||
#include "core/inc/counted_queue_manager.h"
|
||||
|
||||
namespace rocr {
|
||||
namespace AMD {
|
||||
@@ -342,6 +343,16 @@ class GpuAgent : public GpuAgentInt {
|
||||
void AcquireQueueAltScratch(ScratchInfo& scratch) override;
|
||||
void ReleaseQueueAltScratch(ScratchInfo& scratch) override;
|
||||
|
||||
// @brief Create a pool of shared queues for multiple user applications within a max limit
|
||||
hsa_status_t AcquireCountedQueue(hsa_queue_type_t type,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
|
||||
void* data, uint64_t flags,
|
||||
hsa_queue_t** out_queue);
|
||||
|
||||
// @brief Release a queue earlier used by application
|
||||
hsa_status_t ReleaseCountedQueue(hsa_queue_t* queue);
|
||||
|
||||
// @brief Override from AMD::GpuAgentInt.
|
||||
void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) override;
|
||||
|
||||
@@ -645,6 +656,9 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief HSA profile.
|
||||
hsa_profile_t profile_;
|
||||
|
||||
// @brief Pool of shared queues owned by this agent
|
||||
rocr::core::CountedQueuePoolManager queue_pool_;
|
||||
|
||||
void* trap_code_buf_;
|
||||
|
||||
size_t trap_code_buf_size_;
|
||||
|
||||
@@ -97,10 +97,10 @@ public:
|
||||
uint32_t node_id) override;
|
||||
hsa_status_t FreeMemory(void *mem, size_t size) override;
|
||||
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const override;
|
||||
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
|
||||
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
|
||||
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
|
||||
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
|
||||
|
||||
@@ -90,12 +90,12 @@ class KfdVirtioDriver final : public core::Driver {
|
||||
const uint32_t* nodes) const override;
|
||||
hsa_status_t MakeMemoryUnresident(const void* mem) const override;
|
||||
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const override;
|
||||
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
|
||||
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_percentage,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_mem, uint64_t queue_size,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_mem, uint64_t queue_size,
|
||||
HsaEvent* event) const override;
|
||||
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t num_cu_mask,
|
||||
uint32_t* cu_mask) const override;
|
||||
|
||||
@@ -204,10 +204,10 @@ public:
|
||||
uint32_t node_id) override;
|
||||
hsa_status_t FreeMemory(void *mem, size_t size) override;
|
||||
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const override;
|
||||
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
|
||||
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
|
||||
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
|
||||
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#ifndef HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
|
||||
#define HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
|
||||
|
||||
|
||||
#include "hsa.h"
|
||||
#include "hsa_ext_amd.h"
|
||||
#include "core/inc/agent.h"
|
||||
#include "core/inc/runtime.h"
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
|
||||
// Wrapper for a logical counted queue (unique handle + callback)
|
||||
struct CountedQueue {
|
||||
core::Queue* hw_queue; // this will store the public handle of HW Queue (hsa_queue_t)
|
||||
void (*callback)(hsa_status_t, hsa_queue_t*, void*);
|
||||
void* callback_data;
|
||||
|
||||
CountedQueue(core::Queue* hw, void (*cb)(hsa_status_t, hsa_queue_t*, void*), void* data)
|
||||
: hw_queue(hw), callback(cb), callback_data(data) {}
|
||||
};
|
||||
|
||||
// Manages the pool of counted queues for a single GPU agent
|
||||
class CountedQueuePoolManager {
|
||||
public:
|
||||
explicit CountedQueuePoolManager(core::Agent*);
|
||||
|
||||
// Acquire a queue (either reuse or create new)
|
||||
hsa_status_t AcquireQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data,
|
||||
uint64_t flags, hsa_queue_t** out_queue);
|
||||
|
||||
// Release a logical queue
|
||||
hsa_status_t ReleaseQueue(hsa_queue_t* queue);
|
||||
|
||||
// Called during hsa_shutdown to remove all user and CP queues
|
||||
void Cleanup();
|
||||
|
||||
private:
|
||||
core::Queue* FindOrCreateHardwareQueue(hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
|
||||
void* data, uint64_t flags);
|
||||
|
||||
core::Agent* agent_; // pointer to the gpu agent that owns this pool
|
||||
uint32_t max_hw_queues_;
|
||||
std::mutex mutex_;
|
||||
|
||||
// Pool of hw queues by priority on the agent
|
||||
std::map<HSA::hsa_amd_queue_priority_internal_t, std::vector<core::Queue*>> hw_queue_pools_;
|
||||
|
||||
// Map from unique handle to CountedQueue (hw queue, metadata per acquire request)
|
||||
std::map<hsa_queue_t*, std::unique_ptr<CountedQueue>> counted_queues_;
|
||||
};
|
||||
|
||||
} // namespace core
|
||||
} // namespace rocr
|
||||
|
||||
#endif // HSA_RUNTIME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
|
||||
@@ -50,6 +50,7 @@
|
||||
#include "core/inc/memory_region.h"
|
||||
#include "hsakmt/hsakmttypes.h"
|
||||
#include "inc/hsa.h"
|
||||
#include "core/inc/hsa_internal.h"
|
||||
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
@@ -159,7 +160,7 @@ public:
|
||||
/// @param[in] event HsaEvent for event-driven callbacks.
|
||||
/// @param[out] queue_resource Queue resource information populated by the driver.
|
||||
virtual hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, uint32_t sdma_engine_id,
|
||||
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const = 0;
|
||||
|
||||
@@ -175,7 +176,7 @@ public:
|
||||
/// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes.
|
||||
/// @param[in] event HsaEvent for event-driven callbacks.
|
||||
virtual hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_addr,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority, void* queue_addr,
|
||||
uint64_t queue_size_bytes, HsaEvent* event) const = 0;
|
||||
|
||||
/// @brief Set the CU mask for a queue.
|
||||
|
||||
@@ -60,7 +60,7 @@ class HostQueue : public Queue {
|
||||
~HostQueue();
|
||||
|
||||
hsa_status_t Inactivate() override { return HSA_STATUS_SUCCESS; }
|
||||
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override {
|
||||
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
|
||||
@@ -381,6 +381,15 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
|
||||
uint64_t size, int64_t file_offset,
|
||||
uint64_t *size_copied, int32_t *status);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type,
|
||||
hsa_amd_queue_priority_t priority,
|
||||
void (*callback)(hsa_status_t status,
|
||||
hsa_queue_t* source, void* data),
|
||||
void* data, uint64_t flags, hsa_queue_t** queue);
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue);
|
||||
|
||||
// Mirrors Amd Extension Apis
|
||||
hsa_status_t HSA_API hsa_amd_enable_logging(uint8_t* flags, void* file);
|
||||
|
||||
|
||||
@@ -44,10 +44,18 @@
|
||||
#define HSA_RUNTIME_CORE_INC_HSA_INTERNAL_H
|
||||
|
||||
#include "inc/hsa.h"
|
||||
#include "inc/hsa_ext_amd.h"
|
||||
|
||||
namespace rocr {
|
||||
namespace HSA {
|
||||
|
||||
typedef enum hsa_amd_queue_priority_internal_s {
|
||||
HSA_AMD_QUEUE_PRIORITY_LOW = HSA_AMD_QUEUE_PRIORITY_LOW,
|
||||
HSA_AMD_QUEUE_PRIORITY_NORMAL = HSA_AMD_QUEUE_PRIORITY_NORMAL,
|
||||
HSA_AMD_QUEUE_PRIORITY_HIGH = HSA_AMD_QUEUE_PRIORITY_HIGH,
|
||||
HSA_AMD_QUEUE_PRIORITY_MAXIMUM = HSA_AMD_QUEUE_PRIORITY_HIGH + 1,
|
||||
} hsa_amd_queue_priority_internal_t;
|
||||
|
||||
// Define core namespace interfaces - copy of function declarations in hsa.h
|
||||
hsa_status_t hsa_init();
|
||||
hsa_status_t hsa_shut_down();
|
||||
|
||||
@@ -66,7 +66,7 @@ class QueueWrapper : public Queue {
|
||||
explicit QueueWrapper(std::unique_ptr<Queue> queue)
|
||||
: Queue(static_cast<core::SharedQueue*>(core::Runtime::runtime_singleton_->system_allocator()(
|
||||
sizeof(core::SharedQueue), 4096, 0, 0)),
|
||||
0),
|
||||
0, nullptr),
|
||||
wrapped(std::move(queue)) {
|
||||
memcpy(&amd_queue_, &wrapped->amd_queue_, sizeof(amd_queue_));
|
||||
wrapped->set_public_handle(wrapped.get(), public_handle_);
|
||||
@@ -77,7 +77,7 @@ class QueueWrapper : public Queue {
|
||||
}
|
||||
|
||||
hsa_status_t Inactivate() override { return wrapped->Inactivate(); }
|
||||
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override {
|
||||
hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) override {
|
||||
return wrapped->SetPriority(priority);
|
||||
}
|
||||
uint64_t LoadReadIndexAcquire() override { return wrapped->LoadReadIndexAcquire(); }
|
||||
|
||||
@@ -168,12 +168,15 @@ All funtions other than Convert and public_handle must be virtual.
|
||||
*/
|
||||
class Queue : public Checked<0xFA3906A679F9DB49> {
|
||||
public:
|
||||
Queue(SharedQueue* shared_queue, uint64_t queue_flags)
|
||||
: Queue(shared_queue, queue_flags, false) {}
|
||||
Queue(SharedQueue* shared_queue, uint64_t queue_flags, core::Agent* agent)
|
||||
: Queue(shared_queue, queue_flags, false, agent) {}
|
||||
|
||||
Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering)
|
||||
Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering, core::Agent* agent)
|
||||
: amd_queue_(shared_queue->amd_queue),
|
||||
use_count(0),
|
||||
is_counted_queue(false),
|
||||
shared_queue_(shared_queue),
|
||||
agent_(agent),
|
||||
flags_(queue_flags),
|
||||
pcie_write_ordering_(pcie_write_ordering) {
|
||||
public_handle_ = Convert(this);
|
||||
@@ -213,7 +216,7 @@ class Queue : public Checked<0xFA3906A679F9DB49> {
|
||||
virtual hsa_status_t Inactivate() = 0;
|
||||
|
||||
/// @brief Change the scheduling priority of the queue
|
||||
virtual hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) = 0;
|
||||
virtual hsa_status_t SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) = 0;
|
||||
|
||||
/// @brief Reads the Read Index of Queue using Acquire semantics
|
||||
///
|
||||
@@ -372,6 +375,13 @@ class Queue : public Checked<0xFA3906A679F9DB49> {
|
||||
|
||||
hsa_queue_t* public_handle() const { return public_handle_; }
|
||||
|
||||
// Get a pointer to the agent that owns this queue
|
||||
core::Agent* GetAgent() { return agent_; }
|
||||
|
||||
// @brief Attributes specifically for counted queue types
|
||||
uint32_t use_count;
|
||||
bool is_counted_queue;
|
||||
|
||||
typedef void* rtti_t;
|
||||
|
||||
bool IsType(rtti_t id) { return _IsA(id); }
|
||||
@@ -401,6 +411,8 @@ class Queue : public Checked<0xFA3906A679F9DB49> {
|
||||
|
||||
SharedQueue* shared_queue_;
|
||||
|
||||
core::Agent* agent_; // pointer to the agent that owns this queue
|
||||
|
||||
hsa_queue_t* public_handle_;
|
||||
|
||||
/// Next available queue id.
|
||||
|
||||
@@ -67,7 +67,7 @@ namespace AMD {
|
||||
|
||||
AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_t req_size_pkts,
|
||||
uint32_t node_id, uint64_t flags)
|
||||
: Queue(shared_queue, flags),
|
||||
: Queue(shared_queue, flags, agent),
|
||||
LocalSignal(0, false),
|
||||
DoorbellSignal(signal()),
|
||||
agent_(*agent),
|
||||
@@ -104,7 +104,7 @@ AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_
|
||||
|
||||
HsaQueueResource queue_resource = {};
|
||||
hsa_status_t status =
|
||||
agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, HSA_QUEUE_PRIORITY_NORMAL, 0,
|
||||
agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, rocr::HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL, 0,
|
||||
nullptr, queue_size_bytes_, nullptr, queue_resource);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
throw AMD::hsa_exception(status, "Failed to create a hardware context for an AIE queue.");
|
||||
@@ -135,7 +135,7 @@ hsa_status_t AieAqlQueue::Inactivate() {
|
||||
return status;
|
||||
}
|
||||
|
||||
hsa_status_t AieAqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
|
||||
hsa_status_t AieAqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -277,6 +277,13 @@ hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute,
|
||||
// Hardware doorbell supports AQL semantics.
|
||||
*static_cast<uint64_t*>(value) = reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
|
||||
break;
|
||||
case HSA_QUEUE_INFO_USE_COUNT:
|
||||
// AIE queues do not support counted queue features
|
||||
*static_cast<uint32_t*>(value) = static_cast<uint32_t>(-1);
|
||||
break;
|
||||
case HSA_QUEUE_INFO_HW_ID:
|
||||
*static_cast<uint32_t*>(value) = public_handle()->id;
|
||||
break;
|
||||
default:
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
@@ -81,7 +81,7 @@ namespace AMD {
|
||||
AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_size_pkts,
|
||||
HSAuint32 node_id, ScratchInfo& scratch, core::HsaEventCallback callback,
|
||||
void* err_data, uint64_t flags)
|
||||
: Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu()),
|
||||
: Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu(), agent),
|
||||
LocalSignal(0, false),
|
||||
DoorbellSignal(signal()),
|
||||
ring_buf_(nullptr),
|
||||
@@ -97,7 +97,7 @@ AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_
|
||||
dynamicScratchState(0),
|
||||
exceptionState(0),
|
||||
suspended_(false),
|
||||
priority_(HSA_QUEUE_PRIORITY_NORMAL),
|
||||
priority_(HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL),
|
||||
exception_signal_(nullptr) {
|
||||
|
||||
// Queue size is a function of several restrictions.
|
||||
@@ -495,6 +495,21 @@ hsa_status_t AqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value
|
||||
*(reinterpret_cast<uint64_t*>(value)) =
|
||||
reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
|
||||
break;
|
||||
case HSA_QUEUE_INFO_USE_COUNT:
|
||||
if (!is_counted_queue) {
|
||||
*static_cast<uint32_t*>(value) = static_cast<uint32_t>(-1);
|
||||
} else {
|
||||
if (use_count == 0) {
|
||||
// Queue was released
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
*static_cast<uint32_t*>(value) = use_count;
|
||||
}
|
||||
break;
|
||||
case HSA_QUEUE_INFO_HW_ID:
|
||||
// Return the hardware queue ID for both counted and non-counted queues
|
||||
*static_cast<uint32_t*>(value) = public_handle()->id;
|
||||
break;
|
||||
default:
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
@@ -631,7 +646,7 @@ hsa_status_t AqlQueue::Inactivate() {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t AqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
|
||||
hsa_status_t AqlQueue::SetPriority(HSA::hsa_amd_queue_priority_internal_t priority) {
|
||||
if (suspended_) {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
@@ -184,7 +184,7 @@ hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgm
|
||||
// boolean flag
|
||||
const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
|
||||
(use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
|
||||
if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM,
|
||||
if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM,
|
||||
rec_eng, queue_start_addr_, kQueueSize, nullptr,
|
||||
queue_resource_) != HSA_STATUS_SUCCESS) {
|
||||
LogPrint(HSA_AMD_LOG_FLAG_INFO, "Failed to create queue, size=%d, type=%d,"
|
||||
|
||||
@@ -99,6 +99,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
|
||||
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
|
||||
scratch_used_large_(0),
|
||||
queues_(),
|
||||
queue_pool_(this),
|
||||
trap_code_buf_(NULL),
|
||||
trap_code_buf_size_(0),
|
||||
doorbell_queue_map_(NULL),
|
||||
@@ -774,13 +775,13 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
|
||||
|
||||
void GpuAgent::InitDma() {
|
||||
// Setup lazy init pointers on queues and blits.
|
||||
auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) {
|
||||
auto queue_lambda = [this](HSA::hsa_amd_queue_priority_internal_t priority = HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL) {
|
||||
auto queue = CreateInterceptibleQueue();
|
||||
if (queue == nullptr)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
|
||||
"Internal queue creation failed.");
|
||||
|
||||
if (priority != HSA_QUEUE_PRIORITY_NORMAL)
|
||||
if (priority != HSA::HSA_AMD_QUEUE_PRIORITY_NORMAL)
|
||||
if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR,
|
||||
"Failed to increase queue priority for PC Sampling");
|
||||
@@ -795,7 +796,7 @@ void GpuAgent::InitDma() {
|
||||
// Dedicated compute queue for PC Sampling CP-DMA commands. We need a dedicated queue that runs at
|
||||
// highest priority because we do not want the CP-DMA commands to be delayed/blocked due to
|
||||
// other dispatches/barriers that could be in the other AQL queues.
|
||||
queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
|
||||
queues_[QueuePCSampling].reset([queue_lambda]() { return queue_lambda(HSA::HSA_AMD_QUEUE_PRIORITY_MAXIMUM); });
|
||||
|
||||
// Decide which engine to use for blits.
|
||||
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev, uint32_t rec_eng) {
|
||||
@@ -924,6 +925,10 @@ void GpuAgent::PreloadBlits() {
|
||||
void GpuAgent::ReleaseResources() {
|
||||
if (this->Enabled()) {
|
||||
this->Disable();
|
||||
|
||||
// Remove all shared hardware queues from pool
|
||||
queue_pool_.Cleanup();
|
||||
|
||||
for (auto& blit : blits_) {
|
||||
if (!blit.empty()) {
|
||||
hsa_status_t status = blit->Destroy();
|
||||
@@ -1829,6 +1834,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
|
||||
|
||||
auto aql_queue = new AqlQueue(shared_queue, this, size, node_id(), scratch, event_callback, data,
|
||||
flags);
|
||||
|
||||
*queue = aql_queue;
|
||||
aql_queues_.push_back(aql_queue);
|
||||
|
||||
@@ -3519,5 +3525,17 @@ hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& sessi
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::AcquireCountedQueue(hsa_queue_type_t type,
|
||||
HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
|
||||
void* data, uint64_t flags,
|
||||
hsa_queue_t** out_queue) {
|
||||
return queue_pool_.AcquireQueue(type, priority, callback, data, flags, out_queue);
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::ReleaseCountedQueue(hsa_queue_t* queue) {
|
||||
return queue_pool_.ReleaseQueue(queue);
|
||||
}
|
||||
|
||||
} // namespace amd
|
||||
} // namespace rocr
|
||||
|
||||
@@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "core/inc/counted_queue_manager.h"
|
||||
#include "core/inc/agent.h"
|
||||
#include "core/inc/runtime.h"
|
||||
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
|
||||
constexpr size_t DEFAULT_QUEUE_SIZE = 16384;
|
||||
|
||||
CountedQueuePoolManager::CountedQueuePoolManager(core::Agent* agent) : agent_(agent) {
|
||||
// Read in GPU_MAX_HW_QUEUES flag value
|
||||
max_hw_queues_ = core::Runtime::runtime_singleton_->flag().cp_queues_limit();
|
||||
}
|
||||
|
||||
hsa_status_t CountedQueuePoolManager::AcquireQueue(
|
||||
hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags,
|
||||
hsa_queue_t** out_queue) {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
core::Queue* core_queue = FindOrCreateHardwareQueue(type, priority, callback, data, flags);
|
||||
if (!core_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
// Create unique SharedQueue structure and store the unique handle in it
|
||||
SharedQueue* shared_queue = new (std::nothrow) SharedQueue();
|
||||
if (!shared_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
// Copy amd_queue from HW queue
|
||||
shared_queue->amd_queue = core_queue->amd_queue_;
|
||||
|
||||
// Point to the SAME underlying core::Queue (shared HW queue)
|
||||
shared_queue->core_queue = core_queue;
|
||||
|
||||
// Create a unique handle from this new SharedQueue
|
||||
hsa_queue_t* unique_handle = &shared_queue->amd_queue.hsa_queue;
|
||||
|
||||
// Track metadata
|
||||
auto counted_q = std::make_unique<CountedQueue>(core_queue, callback, data);
|
||||
counted_queues_[unique_handle] = std::move(counted_q);
|
||||
|
||||
// Increment use count
|
||||
core_queue->use_count++;
|
||||
|
||||
// Mark as a counted queue, if not already set
|
||||
if (!core_queue->is_counted_queue) {
|
||||
core_queue->is_counted_queue = true;
|
||||
}
|
||||
|
||||
*out_queue = unique_handle;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
core::Queue* CountedQueuePoolManager::FindOrCreateHardwareQueue(
|
||||
hsa_queue_type_t type, HSA::hsa_amd_queue_priority_internal_t priority,
|
||||
void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data, uint64_t flags) {
|
||||
auto& pool = hw_queue_pools_[priority];
|
||||
|
||||
// Reuse least-used queue if max reached
|
||||
if (pool.size() >= max_hw_queues_) {
|
||||
core::Queue* least_used = nullptr;
|
||||
uint32_t min_count = UINT32_MAX;
|
||||
|
||||
for (auto* q : pool) {
|
||||
if (q->use_count < min_count) {
|
||||
min_count = q->use_count;
|
||||
least_used = q;
|
||||
}
|
||||
}
|
||||
return least_used;
|
||||
}
|
||||
|
||||
// Create a new hardware queue
|
||||
core::Queue* cmd_queue = nullptr;
|
||||
hsa_status_t status =
|
||||
agent_->QueueCreate(DEFAULT_QUEUE_SIZE, type, 0, callback, data, 0, 0, &cmd_queue);
|
||||
if (status != HSA_STATUS_SUCCESS) return nullptr;
|
||||
|
||||
status = cmd_queue->SetPriority(priority);
|
||||
if (status != HSA_STATUS_SUCCESS) return nullptr;
|
||||
|
||||
cmd_queue->SetProfiling(true);
|
||||
|
||||
// Add to pool
|
||||
pool.push_back(cmd_queue);
|
||||
return cmd_queue;
|
||||
}
|
||||
|
||||
hsa_status_t CountedQueuePoolManager::ReleaseQueue(hsa_queue_t* queue) {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
auto it = counted_queues_.find(queue);
|
||||
if (it == counted_queues_.end()) return HSA_STATUS_ERROR;
|
||||
|
||||
CountedQueue* counted_q = it->second.get();
|
||||
|
||||
// Decrement internal ref count inside core::Queue object
|
||||
if (counted_q->hw_queue->use_count > 0) {
|
||||
counted_q->hw_queue->use_count--;
|
||||
|
||||
// Remove unique handle from map when it is no longer in use by an application
|
||||
if (counted_q->hw_queue->use_count == 0) {
|
||||
counted_queues_.erase(queue);
|
||||
|
||||
// free the associated shared_queue when removing the counted_queue
|
||||
SharedQueue* shared = reinterpret_cast<SharedQueue*>(
|
||||
reinterpret_cast<char*>(queue) - offsetof(SharedQueue, amd_queue.hsa_queue));
|
||||
delete shared;
|
||||
}
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void CountedQueuePoolManager::Cleanup() {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
// Destroy hardware queues
|
||||
for (auto& priority_pool : hw_queue_pools_) {
|
||||
for (auto* hw_queue : priority_pool.second) {
|
||||
if (hw_queue) {
|
||||
hw_queue->Destroy();
|
||||
}
|
||||
}
|
||||
priority_pool.second.clear();
|
||||
}
|
||||
hw_queue_pools_.clear();
|
||||
|
||||
// Clean up counted and shared queues
|
||||
for (auto& cq : counted_queues_) {
|
||||
// Recover SharedQueue from unique handle and free memory
|
||||
hsa_queue_t* queue_handle = cq.first;
|
||||
SharedQueue* shared = reinterpret_cast<SharedQueue*>(
|
||||
reinterpret_cast<char*>(queue_handle) - offsetof(SharedQueue, amd_queue.hsa_queue));
|
||||
delete shared;
|
||||
}
|
||||
counted_queues_.clear();
|
||||
}
|
||||
|
||||
} // namespace core
|
||||
} // namespace rocr
|
||||
@@ -50,7 +50,7 @@ namespace core {
|
||||
|
||||
HostQueue::HostQueue(core::SharedQueue* shared_queue, hsa_region_t region, uint32_t ring_size,
|
||||
hsa_queue_type32_t type, uint32_t features, hsa_signal_t doorbell_signal)
|
||||
: Queue(shared_queue, 0), size_(ring_size) {
|
||||
: Queue(shared_queue, 0, nullptr), size_(ring_size) {
|
||||
HSA::hsa_memory_register(this, sizeof(HostQueue));
|
||||
MAKE_NAMED_SCOPE_GUARD(registerGuard,
|
||||
[&]() { HSA::hsa_memory_deregister(this, sizeof(HostQueue)); });
|
||||
|
||||
@@ -87,7 +87,7 @@ void HsaApiTable::Init() {
|
||||
// they can add preprocessor macros on the new functions
|
||||
|
||||
constexpr size_t expected_core_api_table_size = 1016;
|
||||
constexpr size_t expected_amd_ext_table_size = 624;
|
||||
constexpr size_t expected_amd_ext_table_size = 640;
|
||||
constexpr size_t expected_image_ext_table_size = 128;
|
||||
constexpr size_t expected_finalizer_ext_table_size = 64;
|
||||
constexpr size_t expected_tools_table_size = 64;
|
||||
@@ -476,6 +476,8 @@ void HsaApiTable::UpdateAmdExts() {
|
||||
amd_ext_api.hsa_amd_enable_logging_fn = AMD::hsa_amd_enable_logging;
|
||||
amd_ext_api.hsa_amd_ais_file_write_fn = AMD::hsa_amd_ais_file_write;
|
||||
amd_ext_api.hsa_amd_ais_file_read_fn = AMD::hsa_amd_ais_file_read;
|
||||
amd_ext_api.hsa_amd_counted_queue_acquire_fn = AMD::hsa_amd_counted_queue_acquire;
|
||||
amd_ext_api.hsa_amd_counted_queue_release_fn = AMD::hsa_amd_counted_queue_release;
|
||||
amd_ext_api.hsa_amd_signal_wait_all_fn = AMD::hsa_amd_signal_wait_all;
|
||||
amd_ext_api.hsa_amd_memory_get_preferred_copy_engine_fn = AMD::hsa_amd_memory_get_preferred_copy_engine;
|
||||
amd_ext_api.hsa_amd_portable_export_dmabuf_v2_fn = AMD::hsa_amd_portable_export_dmabuf_v2;
|
||||
|
||||
@@ -62,6 +62,7 @@
|
||||
#include "core/inc/ipc_signal.h"
|
||||
#include "core/inc/runtime.h"
|
||||
#include "core/inc/signal.h"
|
||||
#include "core/inc/counted_queue_manager.h"
|
||||
|
||||
namespace rocr {
|
||||
|
||||
@@ -714,6 +715,10 @@ hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu
|
||||
|
||||
core::Queue* cmd_queue = core::Queue::Convert(queue);
|
||||
IS_VALID(cmd_queue);
|
||||
|
||||
// Check if this a counted queue; NACK if it is
|
||||
if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
|
||||
if (num_cu_mask_count != 0) IS_BAD_PTR(cu_mask);
|
||||
if (num_cu_mask_count % 32 != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask);
|
||||
@@ -1183,22 +1188,13 @@ hsa_status_t hsa_amd_queue_set_priority(hsa_queue_t* queue,
|
||||
core::Queue* cmd_queue = core::Queue::Convert(queue);
|
||||
IS_VALID(cmd_queue);
|
||||
|
||||
// Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH
|
||||
// HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally
|
||||
// in ROCR
|
||||
static std::map<hsa_amd_queue_priority_t, HSA_QUEUE_PRIORITY> ext_kmt_priomap = {
|
||||
{HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM},
|
||||
{HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL},
|
||||
{HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH},
|
||||
};
|
||||
// Check if this a counted queue; NACK if it is
|
||||
if (cmd_queue->is_counted_queue) return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
|
||||
auto priority_it = ext_kmt_priomap.find(priority);
|
||||
|
||||
if (priority_it == ext_kmt_priomap.end()) {
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
return cmd_queue->SetPriority(priority_it->second);
|
||||
// Convert to ROCR internal priority type
|
||||
HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast<HSA::hsa_amd_queue_priority_internal_t>(priority);
|
||||
|
||||
return cmd_queue->SetPriority(priority_);
|
||||
CATCH;
|
||||
}
|
||||
|
||||
@@ -1532,7 +1528,7 @@ hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* _queue,
|
||||
|
||||
core::Queue* queue = core::Queue::Convert(_queue);
|
||||
IS_VALID(queue);
|
||||
|
||||
|
||||
return queue->GetInfo(attribute, value);
|
||||
CATCH;
|
||||
}
|
||||
@@ -1576,6 +1572,72 @@ hsa_status_t hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devic
|
||||
CATCH;
|
||||
}
|
||||
|
||||
hsa_status_t HSA_API
|
||||
hsa_amd_counted_queue_acquire(hsa_agent_t agent,
|
||||
hsa_queue_type_t type,
|
||||
hsa_amd_queue_priority_t priority,
|
||||
void (*callback)(hsa_status_t status,
|
||||
hsa_queue_t* source,
|
||||
void* data),
|
||||
void* data,
|
||||
uint64_t flags,
|
||||
hsa_queue_t** queue) {
|
||||
TRY;
|
||||
IS_OPEN();
|
||||
// Basic validation
|
||||
if (queue == nullptr) {
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
// Check priority
|
||||
if (priority < HSA_AMD_QUEUE_PRIORITY_LOW || priority > HSA_AMD_QUEUE_PRIORITY_HIGH) {
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
// Only support multi-producer queues
|
||||
if (type != HSA_QUEUE_TYPE_MULTI) {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION;
|
||||
}
|
||||
|
||||
// Convert handle to internal agent
|
||||
core::Agent* core_agent = core::Agent::Convert(agent);
|
||||
IS_VALID(core_agent);
|
||||
if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
|
||||
return HSA_STATUS_ERROR_INVALID_AGENT;
|
||||
}
|
||||
AMD::GpuAgent* gpu_agent = static_cast<AMD::GpuAgent*>(core_agent);
|
||||
|
||||
// Convert to ROCR internal priority type
|
||||
HSA::hsa_amd_queue_priority_internal_t priority_ = static_cast<HSA::hsa_amd_queue_priority_internal_t>(priority);
|
||||
|
||||
// Call the queue pool manager
|
||||
return gpu_agent->AcquireCountedQueue(type, priority_, callback, data, flags, queue);
|
||||
CATCH;
|
||||
}
|
||||
|
||||
hsa_status_t HSA_API
|
||||
hsa_amd_counted_queue_release(hsa_queue_t* queue) {
|
||||
TRY;
|
||||
IS_OPEN();
|
||||
// Basic validation
|
||||
if (queue == nullptr) {
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
core::Queue* core_queue = core::Queue::Convert(queue);
|
||||
IS_VALID(core_queue);
|
||||
|
||||
core::Agent* core_agent = core_queue->GetAgent();
|
||||
IS_VALID(core_agent);
|
||||
if (core_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
|
||||
return HSA_STATUS_ERROR_INVALID_AGENT;
|
||||
}
|
||||
AMD::GpuAgent* gpu_agent = static_cast<AMD::GpuAgent*>(core_agent);
|
||||
|
||||
return gpu_agent->ReleaseCountedQueue(queue);
|
||||
CATCH;
|
||||
}
|
||||
|
||||
hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void *file) {
|
||||
TRY;
|
||||
return core::Runtime::runtime_singleton_->EnableLogging(flags, file);
|
||||
|
||||
@@ -438,7 +438,9 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
|
||||
hsa_status_t InterceptQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) {
|
||||
switch (attribute) {
|
||||
case HSA_AMD_QUEUE_INFO_AGENT:
|
||||
case HSA_AMD_QUEUE_INFO_DOORBELL_ID: {
|
||||
case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
|
||||
case HSA_QUEUE_INFO_USE_COUNT:
|
||||
case HSA_QUEUE_INFO_HW_ID: {
|
||||
if (!AMD::AqlQueue::IsType(wrapped.get())) return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
|
||||
AMD::AqlQueue* aqlQueue = static_cast<AMD::AqlQueue*>(wrapped.get());
|
||||
|
||||
@@ -306,6 +306,8 @@ class Flag {
|
||||
core_dump_disable_ = (var == "1");
|
||||
|
||||
core_dump_pattern_ = os::GetEnvVar("HSA_COREDUMP_PATTERN");
|
||||
var = os::GetEnvVar("GPU_MAX_HW_QUEUES");
|
||||
cp_queues_limit_ = var.empty() ? 4 : atoi(var.c_str());
|
||||
}
|
||||
|
||||
void parse_masks(uint32_t maxGpu, uint32_t maxCU) {
|
||||
@@ -426,6 +428,8 @@ class Flag {
|
||||
|
||||
size_t co_dmacopy_size() const { return co_dmacopy_size_; }
|
||||
|
||||
uint32_t cp_queues_limit() const { return cp_queues_limit_; }
|
||||
|
||||
bool dev_mem_queue_buf() const { return dev_mem_queue_buf_; }
|
||||
|
||||
uint32_t signal_abort_timeout() const { return signal_abort_timeout_; }
|
||||
@@ -545,6 +549,8 @@ class Flag {
|
||||
bool enable_core_dump_progress_ = false;
|
||||
std::string core_dump_pattern_;
|
||||
|
||||
uint32_t cp_queues_limit_;
|
||||
|
||||
// Map GPU index post RVD to its default cu mask.
|
||||
std::map<uint32_t, std::vector<uint32_t>> cu_mask_;
|
||||
|
||||
|
||||
@@ -266,6 +266,8 @@ global:
|
||||
hsa_ext_image_create_v2;
|
||||
hsa_ext_image_data_get_info_v2;
|
||||
hsa_ext_image_destroy_v2;
|
||||
hsa_amd_counted_queue_acquire;
|
||||
hsa_amd_counted_queue_release;
|
||||
local:
|
||||
*;
|
||||
};
|
||||
|
||||
@@ -273,6 +273,8 @@ struct AmdExtTable {
|
||||
decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn;
|
||||
decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn;
|
||||
decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn;
|
||||
decltype(hsa_amd_counted_queue_acquire)* hsa_amd_counted_queue_acquire_fn;
|
||||
decltype(hsa_amd_counted_queue_release)* hsa_amd_counted_queue_release_fn;
|
||||
};
|
||||
|
||||
// Table to export HSA Core Runtime Apis
|
||||
|
||||
@@ -58,7 +58,7 @@
|
||||
// Step Ids of the Api tables exported by Hsa Core Runtime
|
||||
#define HSA_API_TABLE_STEP_VERSION 0x01
|
||||
#define HSA_CORE_API_TABLE_STEP_VERSION 0x00
|
||||
#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x08
|
||||
#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x09
|
||||
#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00
|
||||
#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01
|
||||
// Rocprofiler just checks HSA_MAGE_EXT_API_TABLE_STEP_VERSION
|
||||
|
||||
@@ -68,7 +68,7 @@
|
||||
* - 1.15 - hsa_amd_register_system_event_handler: HSA_AMD_SYSTEM_SHUTDOWN
|
||||
*/
|
||||
#define HSA_AMD_INTERFACE_VERSION_MAJOR 1
|
||||
#define HSA_AMD_INTERFACE_VERSION_MINOR 15
|
||||
#define HSA_AMD_INTERFACE_VERSION_MINOR 16
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -3666,6 +3666,18 @@ typedef enum {
|
||||
* The type of this attribute is uint64_t.
|
||||
*/
|
||||
HSA_AMD_QUEUE_INFO_DOORBELL_ID,
|
||||
/*
|
||||
* Returns how many times the underlying hardware queue has been shared.
|
||||
* @p value will be set to -1 if this queue was not allocated using
|
||||
* hsa_amd_counted_queue_acquire. The type of this attribute is uint32_t.
|
||||
*/
|
||||
HSA_QUEUE_INFO_USE_COUNT,
|
||||
/*
|
||||
* Returns a unique ID representing the HW resource used by a counted queue. Two queues
|
||||
* with the same HW_ID use the same underlying hardware queue. This query can be
|
||||
* used on counted and non-counted queues. The type of this attribute is uint32_t.
|
||||
*/
|
||||
HSA_QUEUE_INFO_HW_ID,
|
||||
} hsa_queue_info_attribute_t;
|
||||
|
||||
hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute,
|
||||
@@ -3760,6 +3772,81 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
|
||||
uint64_t size, int64_t file_offset,
|
||||
uint64_t *size_copied, int32_t *status);
|
||||
|
||||
/**
|
||||
* @brief Create a queue that is limited by the GPU_MAX_HW_QUEUES environment variable.
|
||||
*
|
||||
* This underlying hardware queue returned by this function may be shared by other queues. For
|
||||
* each priority this function will create a new hardware queue as long as the number of queues
|
||||
* is less than GPU_MAX_HW_QUEUES. Once the GPU_MAX_HW_QUEUES limit is reached, this function will
|
||||
* stop creating new hardware queues and return a reference to an existing queue of the requested
|
||||
* @p priority instead. Each successful call will return a different @p queue handle.
|
||||
* The hsa_amd_queue_get_info API can be used to determine whether this queue is currently shared.
|
||||
*
|
||||
* When there are multiple eligible hardware queues available, the queue with the lowest
|
||||
* HSA_QUEUE_INFO_USE_COUNT will be returned.
|
||||
*
|
||||
* For each successful call, hsa_amd_counted_queue_release should be called to release the
|
||||
* HSA_QUEUE_INFO_USE_COUNT. After release, the queue handle becomes invalid and must not be used.
|
||||
*
|
||||
* hsa_amd_queue_set_priority and hsa_amd_queue_cu_set_mask cannot be used on counted queues.
|
||||
*
|
||||
* @param[in] agent Agent where to create the queue
|
||||
*
|
||||
* @param[in] type For future use. HSA_QUEUE_TYPE_MULTI is the only valid option.
|
||||
*
|
||||
* @param[in] priority Associated priority. The GPU_MAX_HW_QUEUES limit is counted for each priority
|
||||
*
|
||||
* @param[in] callback callback invoked by the HSA runtime for every asynchronous event related to
|
||||
* the newly created queue. May be NULL. The HSA runtime passes three arguments to the callback: a
|
||||
* code identifying the event that triggered the invocation, a pointer to the queue where the event
|
||||
* originated, and the application data.
|
||||
*
|
||||
* @param[in] data Application data that is passed to @p callback on every iteration. May be NULL.
|
||||
*
|
||||
* @param[in] flags For future use. Ignored.
|
||||
*
|
||||
* @param[out] queue Unique handle to reference the newly created queue.
|
||||
*
|
||||
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate the resources required
|
||||
* by the implementation.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid or not a GPU agent.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p type is not HSA_QUEUE_TYPE_MULTI.
|
||||
*
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid priority or NULL queue pointer.
|
||||
*/
|
||||
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type,
|
||||
hsa_amd_queue_priority_t priority,
|
||||
void (*callback)(hsa_status_t status,
|
||||
hsa_queue_t* source, void* data),
|
||||
void* data, uint64_t flags, hsa_queue_t** queue);
|
||||
|
||||
/**
|
||||
* @brief Release a counted queue and decrements its use count.
|
||||
*
|
||||
* Releases a queue that was previously acquired using hsa_amd_counted_queue_acquire.
|
||||
* Each call to this API decrements the internal use count HSA_QUEUE_INFO_USE_COUNT
|
||||
* of the underlying hardware. After this call, queue handle is invalid and must not be used.
|
||||
* Once created, the hardware queue is retained until hsa_shutdown is called to avoid costly
|
||||
* overhead of repeatedly creating new hardware queues, allowing them to be reused.
|
||||
*
|
||||
*
|
||||
* @param[in] queue Counted queue handle returned from hsa_amd_counted_queue_acquire.
|
||||
* Must not be NULL.
|
||||
*
|
||||
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
|
||||
* @retval ::HSA_STATUS_ERROR Invalid queue or queue was already released.
|
||||
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The @p queue is NULL or previously released.
|
||||
* @retval ::HSA_STATUS_ERROR_INVALID_AGENT The queue's agent is invalid or not a GPU agent.
|
||||
*/
|
||||
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue);
|
||||
|
||||
/**
|
||||
* @brief logging types
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user