rocr: Update Driver queue-related APIs

Update the user-mode driver queue APIs to leverage KMT types.

Move queue-related calls to the core::Driver API.
Этот коммит содержится в:
Tony Gutierrez
2025-06-16 12:04:29 -07:00
коммит произвёл Gutierrez, Tony
родитель b3c48cc68c
Коммит e03d44d742
12 изменённых файлов: 249 добавлений и 130 удалений
+40 -2
Просмотреть файл
@@ -353,11 +353,49 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) const {
/// @brief Creates a user-mode queue through the KFD thunk.
///
/// All arguments are forwarded unchanged to hsaKmtCreateQueueExt().
///
/// @param[in] node_id Node ID of the agent on which the queue is created.
/// @param[in] type Queue type.
/// @param[in] queue_pct Maximum percentage of the queue's occupancy allowed.
/// @param[in] priority Scheduling priority of the queue.
/// @param[in] sdma_engine_id SDMA engine ID handed to the thunk.
/// @param[in] queue_addr Address of the queue's ring buffer.
/// @param[in] queue_size_bytes Size of the ring buffer in bytes.
/// @param[in] event HsaEvent handed to the thunk.
/// @param[out] queue_resource Queue resource information filled in by the thunk.
/// @return HSA_STATUS_SUCCESS when the thunk call succeeds; any other thunk
/// status is reported as HSA_STATUS_ERROR_OUT_OF_RESOURCES.
hsa_status_t KfdDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
                                    HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
                                    void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
                                    HsaQueueResource& queue_resource) const {
  const auto kmt_status = HSAKMT_CALL(
      hsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id, queue_addr,
                           queue_size_bytes, event, &queue_resource));
  return (kmt_status == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS
                                               : HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
/// @brief Destroys a queue through the KFD thunk.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
/// @return HSA_STATUS_SUCCESS when hsaKmtDestroyQueue() succeeds,
/// HSA_STATUS_ERROR otherwise.
hsa_status_t KfdDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
  const auto kmt_status = HSAKMT_CALL(hsaKmtDestroyQueue(queue_id));
  return (kmt_status == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
/// @brief Updates a queue's properties through the KFD thunk.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
/// @param[in] queue_pct Maximum percentage of the queue's occupancy allowed.
/// @param[in] priority Scheduling priority of the queue.
/// @param[in] queue_addr Base address of the queue's ring buffer.
/// @param[in] queue_size Ring buffer size forwarded to hsaKmtUpdateQueue().
/// @param[in] event HsaEvent handed to the thunk.
/// @return HSA_STATUS_SUCCESS when hsaKmtUpdateQueue() succeeds,
/// HSA_STATUS_ERROR otherwise.
hsa_status_t KfdDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
                                    HSA_QUEUE_PRIORITY priority, void* queue_addr,
                                    uint64_t queue_size, HsaEvent* event) const {
  const auto kmt_status = HSAKMT_CALL(
      hsaKmtUpdateQueue(queue_id, queue_pct, priority, queue_addr, queue_size, event));
  return (kmt_status == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
/// @brief Sets the CU mask of a queue through the KFD thunk.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
/// @param[in] cu_mask_count Number of CU bits in the mask.
/// @param[in] queue_cu_mask New CU mask for the queue.
/// @return HSA_STATUS_SUCCESS when hsaKmtSetQueueCUMask() succeeds,
/// HSA_STATUS_ERROR otherwise.
hsa_status_t KfdDriver::SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
                                       uint32_t* queue_cu_mask) const {
  const auto kmt_status =
      HSAKMT_CALL(hsaKmtSetQueueCUMask(queue_id, cu_mask_count, queue_cu_mask));
  return (kmt_status == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
/// @brief Allocates global wave sync (GWS) slots for a queue through the KFD thunk.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
/// @param[in] num_gws Number of GWS slots.
/// @param first_gws Pointer forwarded to hsaKmtAllocQueueGWS() for the first GWS slot.
/// @return HSA_STATUS_SUCCESS when hsaKmtAllocQueueGWS() succeeds,
/// HSA_STATUS_ERROR otherwise.
hsa_status_t KfdDriver::AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
                                      uint32_t* first_gws) const {
  const auto kmt_status = HSAKMT_CALL(hsaKmtAllocQueueGWS(queue_id, num_gws, first_gws));
  return (kmt_status == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
+73 -34
Просмотреть файл
@@ -50,7 +50,6 @@
#include <memory>
#include <string>
#include "core/inc/amd_aie_aql_queue.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/runtime.h"
#include "core/util/memory.h"
@@ -173,10 +172,22 @@ hsa_status_t XdnaDriver::GetSystemProperties(HsaSystemProperties& sys_props) con
}
/// @brief Fills in the node properties of the AIE device.
///
/// Queries the AIE metadata from the XDNA kernel driver and derives the number
/// of neural cores from the reported column/row counts.
///
/// @param[out] node_props Properties structure; NumNeuralCores and NumIOLinks
/// are written on success.
/// @param[in] node_id Not used by this implementation.
/// @return HSA_STATUS_SUCCESS on success, HSA_STATUS_ERROR if the
/// DRM_IOCTL_AMDXDNA_GET_INFO ioctl fails.
hsa_status_t XdnaDriver::GetNodeProperties(HsaNodeProperties& node_props, uint32_t node_id) const {
  amdxdna_drm_query_aie_metadata aie_metadata = {};
  amdxdna_drm_get_info get_info_args = {};
  get_info_args.param = DRM_AMDXDNA_QUERY_AIE_METADATA;
  get_info_args.buffer_size = sizeof(aie_metadata);
  get_info_args.buffer = reinterpret_cast<uintptr_t>(&aie_metadata);

  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_INFO, &get_info_args) < 0) {
    return HSA_STATUS_ERROR;
  }

  // Right now can only target N-1 columns as that is the number of shim DMAs
  // in NPU1 devices.
  // The computed value must not be overwritten afterwards: it is later used as
  // the number of core tiles requested when a hardware context is created.
  node_props.NumNeuralCores = (aie_metadata.cols - 1) * aie_metadata.rows;

  /// @todo XDNA driver currently only supports single-node AIE
  /// devices over PCIe. Update this once we can get topology
  /// information dynamically from the sysfs.
  node_props.NumIOLinks = 0;

  return HSA_STATUS_SUCCESS;
}
@@ -303,33 +314,47 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
return HSA_STATUS_SUCCESS;
}
hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const {
if (!AieAqlQueue::IsType(&queue)) {
/// @brief Driver-side queue creation hook for AIE queues.
///
/// No ioctl is issued here and none of the input arguments are consulted; the
/// returned queue ID is only marked as "no hardware context".
///
/// @param[out] queue_resource QueueId is set to AMDXDNA_INVALID_CTX_HANDLE.
/// @return Always HSA_STATUS_SUCCESS.
hsa_status_t XdnaDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
                                     HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
                                     void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
                                     HsaQueueResource& queue_resource) const {
  // NOTE(review): the real handle appears to be assigned later by
  // ConfigHwCtx() when a command chain is submitted - confirm.
  queue_resource.QueueId = AMDXDNA_INVALID_CTX_HANDLE;

  return HSA_STATUS_SUCCESS;
}
/// @brief Destroys the hardware context identified by @p queue_id.
/// @param[in] queue_id Queue ID; its low 32 bits are used as the XDNA hardware
/// context handle.
/// @return HSA_STATUS_ERROR_INVALID_QUEUE if @p queue_id is
/// AMDXDNA_INVALID_CTX_HANDLE, HSA_STATUS_ERROR if the
/// DRM_IOCTL_AMDXDNA_DESTROY_HWCTX ioctl fails, HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
  // Nothing to destroy when no hardware context was ever attached to the queue.
  if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) {
    return HSA_STATUS_ERROR_INVALID_QUEUE;
  }

  amdxdna_drm_destroy_hwctx destroy_hwctx_args = {};
  destroy_hwctx_args.handle = static_cast<uint32_t>(queue_id);
  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) {
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}
hsa_status_t XdnaDriver::DestroyQueue(core::Queue &queue) const {
if (!AieAqlQueue::IsType(&queue)) {
return HSA_STATUS_ERROR_INVALID_QUEUE;
}
/// @brief Queue property updates are not available on AIE devices.
///
/// None of the arguments are consulted.
///
/// @return Always HSA_STATUS_ERROR_INVALID_QUEUE.
hsa_status_t XdnaDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
                                     HSA_QUEUE_PRIORITY priority, void* queue_addr,
                                     uint64_t queue_size, HsaEvent* event) const {
  // AIE doesn't support queue updates.
  return HSA_STATUS_ERROR_INVALID_QUEUE;
}
auto& aie_queue = static_cast<AieAqlQueue&>(queue);
if (aie_queue.GetHwCtxHandle() != AMDXDNA_INVALID_BO_HANDLE) {
amdxdna_drm_destroy_hwctx destroy_hwctx_args = {};
destroy_hwctx_args.handle = aie_queue.GetHwCtxHandle();
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) {
return HSA_STATUS_ERROR;
}
}
/// @brief CU masking is not available on AIE devices.
///
/// None of the arguments are consulted.
///
/// @return Always HSA_STATUS_ERROR_INVALID_QUEUE.
hsa_status_t XdnaDriver::SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
                                        uint32_t* queue_cu_mask) const {
  // AIE doesn't support queue CU masks.
  return HSA_STATUS_ERROR_INVALID_QUEUE;
}
return HSA_STATUS_SUCCESS;
/// @brief Global wave sync (GWS) allocation is not available on AIE devices.
///
/// None of the arguments are consulted; @p first_gws is never written.
///
/// @return Always HSA_STATUS_ERROR_INVALID_QUEUE.
hsa_status_t XdnaDriver::AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
                                       uint32_t* first_gws) const {
  // AIE doesn't support GWS.
  return HSA_STATUS_ERROR_INVALID_QUEUE;
}
hsa_status_t XdnaDriver::ExportDMABuf(void *mem, size_t size, int *dmabuf_fd,
@@ -470,10 +495,15 @@ hsa_status_t XdnaDriver::FreeDeviceHeap() {
hsa_status_t XdnaDriver::ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle,
const std::vector<uint32_t>& bo_handles,
AieAqlQueue& aie_queue) {
HSA_QUEUEID queue_id) {
if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) {
return HSA_STATUS_ERROR_INVALID_QUEUE;
}
auto hw_ctx_handle = static_cast<uint32_t>(queue_id);
// Submit command chain.
amdxdna_drm_exec_cmd exec_cmd = {};
exec_cmd.hwctx = aie_queue.GetHwCtxHandle();
exec_cmd.hwctx = hw_ctx_handle;
exec_cmd.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF;
exec_cmd.cmd_handles = cmd_chain_bo_handle.handle;
exec_cmd.args = reinterpret_cast<uint64_t>(bo_handles.data());
@@ -484,7 +514,7 @@ hsa_status_t XdnaDriver::ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle,
// Waiting for command chain to finish.
amdxdna_drm_wait_cmd wait_cmd = {};
wait_cmd.hwctx = aie_queue.GetHwCtxHandle();
wait_cmd.hwctx = hw_ctx_handle;
wait_cmd.timeout = DEFAULT_TIMEOUT_VAL;
wait_cmd.seq = exec_cmd.seq;
@@ -579,7 +609,7 @@ hsa_status_t XdnaDriver::CreateCmdBO(uint32_t size, BOHandle& cmd_bo_handle) {
}
hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
AieAqlQueue& aie_queue) {
HSA_QUEUEID& queue_id, uint32_t num_core_tiles) {
// Stores instruction and operand BOs.
std::vector<uint32_t> bo_handles;
@@ -593,8 +623,13 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin
}
});
if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) {
return HSA_STATUS_ERROR_INVALID_QUEUE;
}
auto hw_ctx_handle = static_cast<uint32_t>(queue_id);
// PDI cache. If the cache is updated, a new hardware context will be created for the queue.
auto pdi_cache_it = hw_ctx_pdi_cache_map.find(aie_queue.GetHwCtxHandle());
auto pdi_cache_it = hw_ctx_pdi_cache_map.find(hw_ctx_handle);
auto pdi_cache = (pdi_cache_it != hw_ctx_pdi_cache_map.end()) ? pdi_cache_it->second : PDICache{};
bool reconfigure_queue = false;
@@ -664,13 +699,13 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin
hw_ctx_pdi_cache_map.erase(pdi_cache_it);
}
hsa_status_t status = ConfigHwCtx(pdi_cache, aie_queue);
hsa_status_t status = ConfigHwCtx(pdi_cache, queue_id, num_core_tiles);
if (status != HSA_STATUS_SUCCESS) {
return status;
}
// Update cache mapping.
hw_ctx_pdi_cache_map.emplace(aie_queue.GetHwCtxHandle(), pdi_cache);
hw_ctx_pdi_cache_map.emplace(hw_ctx_handle, pdi_cache);
}
// Creating a packet that contains the command chain
@@ -707,7 +742,7 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin
bo_handles.erase(std::unique(bo_handles.begin(), bo_handles.end()), bo_handles.end());
// Executing all commands in the command chain
status = ExecCmdAndWait(cmd_chain_bo_handle, bo_handles, aie_queue);
status = ExecCmdAndWait(cmd_chain_bo_handle, bo_handles, queue_id);
if (status != HSA_STATUS_SUCCESS) {
return status;
}
@@ -760,7 +795,7 @@ hsa_status_t XdnaDriver::SPMSetDestBuffer(uint32_t preferred_node_id, uint32_t s
}
/// @brief Reports whether a driver model is enabled for this driver.
/// @param[out] enable Always set to false.
/// @return Always HSA_STATUS_SUCCESS.
hsa_status_t XdnaDriver::IsModelEnabled(bool* enable) const {
  // AIE does not support a driver model.
  *enable = false;

  return HSA_STATUS_SUCCESS;
}
@@ -803,7 +838,8 @@ XdnaDriver::BOHandle XdnaDriver::FindBOHandle(void* mem) const {
return it->second;
}
hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue& aie_queue) {
hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, HSA_QUEUEID& queue_id,
uint32_t num_core_tiles) {
const size_t config_cu_param_size =
sizeof(amdxdna_hwctx_param_config_cu) + pdi_bo_handles.size() * sizeof(amdxdna_cu_config);
@@ -821,17 +857,20 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue
xdna_config_cu_param->cu_configs[i].cu_func = default_cu_func;
}
if (aie_queue.GetHwCtxHandle() != AMDXDNA_INVALID_BO_HANDLE) {
auto hw_ctx_handle = static_cast<uint32_t>(queue_id);
if (hw_ctx_handle != AMDXDNA_INVALID_CTX_HANDLE) {
// Destroy the hardware context
// Note: we can do this because we have forced synchronization between
// command chains. If we move to a more asynchronous model, we will need to
// figure out how hardware context destruction works while applications
// are running
amdxdna_drm_destroy_hwctx destroy_hwctx_args = {};
destroy_hwctx_args.handle = aie_queue.GetHwCtxHandle();
destroy_hwctx_args.handle = hw_ctx_handle;
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) {
return HSA_STATUS_ERROR;
}
queue_id = AMDXDNA_INVALID_CTX_HANDLE;
}
// Create the new hardware context
@@ -840,7 +879,7 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue
amdxdna_drm_create_hwctx create_hwctx_args = {};
create_hwctx_args.qos_p = reinterpret_cast<uintptr_t>(&qos_info);
create_hwctx_args.max_opc = 0x800;
create_hwctx_args.num_tiles = aie_queue.GetAgent().GetNumCores();
create_hwctx_args.num_tiles = num_core_tiles;
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -857,7 +896,7 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue
return HSA_STATUS_ERROR;
}
aie_queue.SetHwCtxHandle(create_hwctx_args.handle);
queue_id = create_hwctx_args.handle;
return HSA_STATUS_SUCCESS;
}
+27 -34
Просмотреть файл
@@ -55,53 +55,45 @@ class AieAgent : public core::Agent {
public:
/// @brief AIE agent constructor.
/// @param [in] node Node id.
AieAgent(uint32_t node);
AieAgent(uint32_t node, const HsaNodeProperties& node_props);
// @brief AIE agent destructor.
~AieAgent();
// @brief AIE agent destructor.
~AieAgent();
hsa_status_t VisitRegion(bool include_peer,
hsa_status_t (*callback)(hsa_region_t region,
void *data),
void *data) const;
hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region,
void *data),
void *data) const override;
hsa_status_t VisitRegion(bool include_peer,
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const;
hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const override;
hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache,
void *data),
void *value) const override;
hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
void* value) const override;
hsa_status_t IterateSupportedIsas(
hsa_status_t (*callback)(hsa_isa_t isa, void* data),
void* data) const override;
hsa_status_t IterateSupportedIsas(hsa_status_t (*callback)(hsa_isa_t isa, void* data),
void* data) const override;
hsa_status_t GetInfo(hsa_agent_info_t attribute, void *value) const override;
hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;
hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type, uint64_t flags,
core::HsaEventCallback event_callback, void* data,
uint32_t private_segment_size, uint32_t group_segment_size,
core::Queue** queue) override;
hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type, uint64_t flags,
core::HsaEventCallback event_callback, void* data,
uint32_t private_segment_size, uint32_t group_segment_size,
core::Queue** queue) override;
// @brief Override from core::Agent.
const std::vector<const core::Isa*>& supported_isas() const override {
return supported_isas_;
}
// @brief Override from core::Agent.
const std::vector<const core::Isa*>& supported_isas() const override { return supported_isas_; }
const std::vector<const core::MemoryRegion *> &regions() const override {
return regions_;
}
const std::vector<const core::MemoryRegion*>& regions() const override { return regions_; }
/// @brief Getter for the AIE system allocator.
const std::function<void *(size_t size, size_t align,
core::MemoryRegion::AllocateFlags flags)> &
system_allocator() const {
return system_allocator_;
}
/// @brief Getter for the AIE system allocator.
const std::function<void*(size_t size, size_t align, core::MemoryRegion::AllocateFlags flags)>&
system_allocator() const {
return system_allocator_;
}
/// @brief Getter for the AIE system deallocator.
const std::function<void(void*)>& system_deallocator() const { return system_deallocator_; }
const HsaNodeProperties& properties() const { return node_props_; }
// AIE agent methods.
/// @brief Get the number of columns on this AIE agent.
uint32_t GetNumCols() const { return num_cols_; }
@@ -134,6 +126,7 @@ private:
const uint32_t max_aql_size_ = 0x40;
const uint32_t max_queues_ = 1;
const HsaNodeProperties node_props_;
/// @brief Number of columns in the AIE array.
uint32_t num_cols_ = 0;
/// @brief Number of rows of core tiles in the AIE array. Not all rows in a
-19
Просмотреть файл
@@ -103,14 +103,6 @@ class AieAqlQueue : public core::Queue,
/// @brief Returns the agent associated with this queue.
AieAgent& GetAgent() { return agent_; }
/// @brief Sets the hardware context.
void SetHwCtxHandle(uint32_t hw_ctx_handle) {
hw_ctx_handle_ = hw_ctx_handle;
}
/// @brief Returns the hardware context.
uint32_t GetHwCtxHandle() const { return hw_ctx_handle_; }
// GPU-specific queue functions are unsupported.
hsa_status_t GetCUMasking(uint32_t num_cu_mask_count,
@@ -141,17 +133,6 @@ class AieAqlQueue : public core::Queue,
/// @brief Called when the doorbell is rung to submit all queued packets.
void SubmitPackets();
/// @brief Handle for an application context on the AIE device.
///
/// Each user queue will have an associated context. This handle is assigned
/// by the driver on context creation.
///
/// TODO: For now we support a single context that allocates all core tiles in
/// the array. In the future we can make the number of tiles configurable so
/// that multiple workloads with different core tile configurations can
/// execute on the AIE agent at the same time.
uint32_t hw_ctx_handle_ = std::numeric_limits<uint32_t>::max();
/// @brief Indicates if queue is active.
std::atomic<bool> active_;
static __forceinline int& rtti_id() {
+9 -11
Просмотреть файл
@@ -2,24 +2,24 @@
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
//
// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
//
// AMD Research and AMD HSA Software Development
//
//
// Advanced Micro Devices, Inc.
//
//
// www.amd.com
//
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
@@ -29,7 +29,7 @@
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
@@ -47,8 +47,6 @@
#include <stdint.h>
#include <vector>
#include "hsakmt/hsakmt.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/blit.h"
#include "core/inc/runtime.h"
+11 -2
Просмотреть файл
@@ -97,8 +97,17 @@ public:
void **mem, size_t size,
uint32_t node_id) override;
hsa_status_t FreeMemory(void *mem, size_t size) override;
hsa_status_t CreateQueue(core::Queue &queue) const override;
hsa_status_t DestroyQueue(core::Queue &queue) const override;
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const override;
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
uint32_t* queue_cu_mask) const override;
hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
uint32_t* first_gws) const override;
hsa_status_t ExportDMABuf(void *mem, size_t size, int *dmabuf_fd,
size_t *offset) override;
hsa_status_t ImportDMABuf(int dmabuf_fd, core::Agent &agent,
+15 -7
Просмотреть файл
@@ -95,8 +95,6 @@ class Queue;
namespace AMD {
class AieAqlQueue;
// @brief: Used to transform an address into a device address
constexpr uint32_t DEV_ADDR_BASE = 0x04000000;
constexpr uint32_t DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;
@@ -209,8 +207,17 @@ public:
void **mem, size_t size,
uint32_t node_id) override;
hsa_status_t FreeMemory(void *mem, size_t size) override;
hsa_status_t CreateQueue(core::Queue &queue) const override;
hsa_status_t DestroyQueue(core::Queue &queue) const override;
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const override;
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
uint32_t* queue_cu_mask) const override;
hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
uint32_t* first_gws) const override;
hsa_status_t ExportDMABuf(void *mem, size_t size, int *dmabuf_fd,
size_t *offset) override;
hsa_status_t ImportDMABuf(int dmabuf_fd, core::Agent &agent,
@@ -223,7 +230,7 @@ public:
/// @brief Submits @p num_pkts packets in a command chain.
hsa_status_t SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
AieAqlQueue& aie_queue);
HSA_QUEUEID& queue_id, uint32_t num_core_tiles);
hsa_status_t SPMAcquire(uint32_t preferred_node_id) const override;
hsa_status_t SPMRelease(uint32_t preferred_node_id) const override;
@@ -243,7 +250,8 @@ public:
BOHandle FindBOHandle(void* mem) const;
/// @brief Creates a new hardware context with the given PDI BO handles.
hsa_status_t ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue& aie_queue);
hsa_status_t ConfigHwCtx(const PDICache& pdi_bo_handles, HSA_QUEUEID& queue_id,
uint32_t num_core_tiles);
hsa_status_t QueryDriverVersion();
@@ -274,7 +282,7 @@ public:
/// @param bo_handles handles associated with the command
/// @param queue_id ID of the queue (hardware context) to submit to
hsa_status_t ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle,
const std::vector<uint32_t>& bo_handles, AieAqlQueue& aie_queue);
const std::vector<uint32_t>& bo_handles, HSA_QUEUEID queue_id);
/// TODO: Remove this in the future and rely on the core Runtime
/// object to track handle allocations. Using the VMEM API for mapping XDNA
+47 -2
Просмотреть файл
@@ -43,6 +43,7 @@
#ifndef HSA_RUNTME_CORE_INC_DRIVER_H_
#define HSA_RUNTME_CORE_INC_DRIVER_H_
#include <cstdint>
#include <limits>
#include <string>
@@ -146,9 +147,53 @@ public:
virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0;
virtual hsa_status_t CreateQueue(Queue &queue) const = 0;
/// @brief Create an agent dispatch queue with user-mode access rights.
/// @param[in] node_id Node ID of the agent on which the queue is being created.
/// @param[in] type Queue's type.
/// @param[in] queue_pct Maximum percentage of a queue's occupancy allowed.
/// @param[in] priority Queue's priority for scheduling.
/// @param[in] sdma_engine_id ID of the SDMA engine on which the queue is being created. Only used
/// if @p type is one of the SDMA queue types.
/// @param[in] queue_addr Address of the queue's ring buffer.
/// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes.
/// @param[in] event HsaEvent for event-driven callbacks.
/// @param[out] queue_resource Queue resource information populated by the driver.
virtual hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
HsaQueueResource& queue_resource) const = 0;
virtual hsa_status_t DestroyQueue(Queue &queue) const = 0;
/// @brief Destroy a queue.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
virtual hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const = 0;
/// @brief Update a queue's properties.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
/// @param[in] queue_pct Maximum percentage of a queue's occupancy allowed.
/// @param[in] priority Queue's priority for scheduling.
/// @param[in] queue_addr Queue's ring buffer base address.
/// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes.
/// @param[in] event HsaEvent for event-driven callbacks.
virtual hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
HSA_QUEUE_PRIORITY priority, void* queue_addr,
uint64_t queue_size_bytes, HsaEvent* event) const = 0;
/// @brief Set the CU mask for a queue.
/// @details This sets the CU bitmask for a queue. The CU mask determines which CUs
/// a queue's dispatches can target. Currently this is only supported for GPU devices.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
/// @param[in] cu_mask_count Number of CU bits in the mask.
/// @param[in] queue_cu_mask New CU mask for the queue.
virtual hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
uint32_t* queue_cu_mask) const = 0;
/// @brief Allocate global wave sync (GWS) resource for a queue. This is only supported for GPUs.
/// GWS can be used to synchronize wavefronts across the entire GPU device.
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
/// @param[in] num_gws Number of GWS slots.
/// @param[out] first_gws First GWS slot allocated to the queue.
virtual hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
uint32_t* first_gws) const = 0;
/// @brief Imports memory using dma-buf.
///
+4 -4
Просмотреть файл
@@ -55,10 +55,10 @@
namespace rocr {
namespace AMD {
AieAgent::AieAgent(uint32_t node)
: core::Agent(core::Runtime::runtime_singleton_->AgentDriver(
core::DriverType::XDNA),
node, core::Agent::DeviceType::kAmdAieDevice) {
AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props)
: core::Agent(core::Runtime::runtime_singleton_->AgentDriver(core::DriverType::XDNA), node,
core::Agent::DeviceType::kAmdAieDevice),
node_props_(node_props) {
InitRegionList();
InitAllocators();
GetAgentProperties();
+19 -10
Просмотреть файл
@@ -102,16 +102,26 @@ AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_
signal_.queue_ptr = &amd_queue_;
active_ = true;
auto &drv = static_cast<XdnaDriver &>(agent_.driver());
drv.CreateQueue(*this);
HsaQueueResource queue_resource = {};
hsa_status_t status =
agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, HSA_QUEUE_PRIORITY_NORMAL, 0,
nullptr, queue_size_bytes_, nullptr, queue_resource);
if (status != HSA_STATUS_SUCCESS) {
throw AMD::hsa_exception(status, "Failed to create a hardware context for an AIE queue.");
}
queue_id_ = queue_resource.QueueId;
amd_queue_.hsa_queue.id = GetQueueId();
}
/// @brief Tears down the queue.
///
/// Inactivates the queue first, then releases the ring buffer through the
/// agent's system deallocator and the shared queue structure through the
/// runtime's system deallocator. Each buffer is released exactly once.
AieAqlQueue::~AieAqlQueue() {
  // Explicitly qualified so the AieAqlQueue implementation is the one invoked.
  AieAqlQueue::Inactivate();

  if (ring_buf_) {
    agent_.system_deallocator()(ring_buf_);
  }

  if (shared_queue_) {
    core::Runtime::runtime_singleton_->system_deallocator()(shared_queue_);
  }
}
hsa_status_t AieAqlQueue::Inactivate() {
@@ -119,9 +129,7 @@ hsa_status_t AieAqlQueue::Inactivate() {
hsa_status_t status(HSA_STATUS_SUCCESS);
if (active) {
auto &drv = static_cast<XdnaDriver &>(agent_.driver());
status = drv.DestroyQueue(*this);
hw_ctx_handle_ = std::numeric_limits<uint32_t>::max();
agent_.driver().DestroyQueue(queue_id_);
}
return status;
@@ -237,7 +245,8 @@ void AieAqlQueue::SubmitPackets() {
// Call into the driver to submit from cur_id to write_dispatch_id.
// Submitting the command chain might create a new hardware context.
hsa_status_t status = driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, *this);
hsa_status_t status = driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, queue_id_,
agent_.properties().NumNeuralCores);
if (status != HSA_STATUS_SUCCESS) {
assert(false && "Could not submit packets");
}
+3 -4
Просмотреть файл
@@ -195,10 +195,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
// boolean flag
const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
(use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
if (HSAKMT_STATUS_SUCCESS != HSAKMT_CALL(hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100,
HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng,
queue_start_addr_, kQueueSize, NULL,
&queue_resource_))) {
if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM,
rec_eng, queue_start_addr_, kQueueSize, nullptr,
queue_resource_) != HSA_STATUS_SUCCESS) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
+1 -1
Просмотреть файл
@@ -173,7 +173,7 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
}
/// @brief Creates the AIE agent for a node and registers it with the runtime.
/// @param[in] node_id Node ID of the AIE device.
/// @param[in] node_prop Node properties passed to the AieAgent constructor.
void DiscoverAie(uint32_t node_id, HsaNodeProperties& node_prop) {
  // NOTE(review): the agent is not deleted here; presumably the runtime owns
  // it after RegisterAgent() - confirm.
  AieAgent* aie = new AieAgent(node_id, node_prop);
  core::Runtime::runtime_singleton_->RegisterAgent(aie, true);
}