rocr: Update Driver queue-related APIs
Update the user-mode driver queue APIs to leverage KMT types. Move queue-related calls to the core::Driver API.
Этот коммит содержится в:
коммит произвёл
Gutierrez, Tony
родитель
b3c48cc68c
Коммит
e03d44d742
@@ -353,11 +353,49 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
|
||||
return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) const {
|
||||
hsa_status_t KfdDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
|
||||
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const {
|
||||
if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id,
|
||||
queue_addr, queue_size_bytes, event, &queue_resource)) !=
|
||||
HSAKMT_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
|
||||
hsa_status_t KfdDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
|
||||
if (HSAKMT_CALL(hsaKmtDestroyQueue(queue_id)) != HSAKMT_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t KfdDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_addr,
|
||||
uint64_t queue_size, HsaEvent* event) const {
|
||||
if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, priority, queue_addr, queue_size,
|
||||
event)) != HSAKMT_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t KfdDriver::SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
|
||||
uint32_t* queue_cu_mask) const {
|
||||
if (HSAKMT_CALL(hsaKmtSetQueueCUMask(queue_id, cu_mask_count, queue_cu_mask)) !=
|
||||
HSAKMT_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t KfdDriver::AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
|
||||
uint32_t* first_gws) const {
|
||||
if (HSAKMT_CALL(hsaKmtAllocQueueGWS(queue_id, num_gws, first_gws)) != HSAKMT_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,6 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "core/inc/amd_aie_aql_queue.h"
|
||||
#include "core/inc/amd_memory_region.h"
|
||||
#include "core/inc/runtime.h"
|
||||
#include "core/util/memory.h"
|
||||
@@ -173,10 +172,22 @@ hsa_status_t XdnaDriver::GetSystemProperties(HsaSystemProperties& sys_props) con
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::GetNodeProperties(HsaNodeProperties& node_props, uint32_t node_id) const {
|
||||
amdxdna_drm_query_aie_metadata aie_metadata = {};
|
||||
amdxdna_drm_get_info get_info_args = {};
|
||||
get_info_args.param = DRM_AMDXDNA_QUERY_AIE_METADATA;
|
||||
get_info_args.buffer_size = sizeof(aie_metadata);
|
||||
get_info_args.buffer = reinterpret_cast<uintptr_t>(&aie_metadata);
|
||||
|
||||
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_INFO, &get_info_args) < 0) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// Right now can only target N-1 columns as that is the number of shim DMAs
|
||||
// in NPU1 devices.
|
||||
node_props.NumNeuralCores = (aie_metadata.cols - 1) * aie_metadata.rows;
|
||||
/// @todo XDNA driver currently only supports single-node AIE
|
||||
/// devices over PCIe. Update this once we can get topology
|
||||
/// information dynamically from the sysfs.
|
||||
node_props.NumNeuralCores = 1;
|
||||
node_props.NumIOLinks = 0;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -303,33 +314,47 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const {
|
||||
if (!AieAqlQueue::IsType(&queue)) {
|
||||
hsa_status_t XdnaDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
|
||||
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const {
|
||||
queue_resource.QueueId = AMDXDNA_INVALID_CTX_HANDLE;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::DestroyQueue(HSA_QUEUEID queue_id) const {
|
||||
if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
// Set the hw ctx handle of the queue to invalid to avoid incorrect destruction.
|
||||
auto& aie_queue = static_cast<AieAqlQueue&>(queue);
|
||||
aie_queue.SetHwCtxHandle(AMDXDNA_INVALID_BO_HANDLE);
|
||||
auto hw_ctx_handle = static_cast<uint32_t>(queue_id);
|
||||
amdxdna_drm_destroy_hwctx destroy_hwctx_args = {};
|
||||
destroy_hwctx_args.handle = hw_ctx_handle;
|
||||
|
||||
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::DestroyQueue(core::Queue &queue) const {
|
||||
if (!AieAqlQueue::IsType(&queue)) {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
hsa_status_t XdnaDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_addr,
|
||||
uint64_t queue_size, HsaEvent* event) const {
|
||||
// AIE doesn't support queue updates.
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
auto& aie_queue = static_cast<AieAqlQueue&>(queue);
|
||||
if (aie_queue.GetHwCtxHandle() != AMDXDNA_INVALID_BO_HANDLE) {
|
||||
amdxdna_drm_destroy_hwctx destroy_hwctx_args = {};
|
||||
destroy_hwctx_args.handle = aie_queue.GetHwCtxHandle();
|
||||
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
}
|
||||
hsa_status_t XdnaDriver::SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
|
||||
uint32_t* queue_cu_mask) const {
|
||||
// AIE doesn't support queue CU masks.
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
hsa_status_t XdnaDriver::AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
|
||||
uint32_t* first_gws) const {
|
||||
// AIE doesn't support GWS.
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::ExportDMABuf(void *mem, size_t size, int *dmabuf_fd,
|
||||
@@ -470,10 +495,15 @@ hsa_status_t XdnaDriver::FreeDeviceHeap() {
|
||||
|
||||
hsa_status_t XdnaDriver::ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle,
|
||||
const std::vector<uint32_t>& bo_handles,
|
||||
AieAqlQueue& aie_queue) {
|
||||
HSA_QUEUEID queue_id) {
|
||||
if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
auto hw_ctx_handle = static_cast<uint32_t>(queue_id);
|
||||
// Submit command chain.
|
||||
amdxdna_drm_exec_cmd exec_cmd = {};
|
||||
exec_cmd.hwctx = aie_queue.GetHwCtxHandle();
|
||||
exec_cmd.hwctx = hw_ctx_handle;
|
||||
exec_cmd.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF;
|
||||
exec_cmd.cmd_handles = cmd_chain_bo_handle.handle;
|
||||
exec_cmd.args = reinterpret_cast<uint64_t>(bo_handles.data());
|
||||
@@ -484,7 +514,7 @@ hsa_status_t XdnaDriver::ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle,
|
||||
|
||||
// Waiting for command chain to finish.
|
||||
amdxdna_drm_wait_cmd wait_cmd = {};
|
||||
wait_cmd.hwctx = aie_queue.GetHwCtxHandle();
|
||||
wait_cmd.hwctx = hw_ctx_handle;
|
||||
wait_cmd.timeout = DEFAULT_TIMEOUT_VAL;
|
||||
wait_cmd.seq = exec_cmd.seq;
|
||||
|
||||
@@ -579,7 +609,7 @@ hsa_status_t XdnaDriver::CreateCmdBO(uint32_t size, BOHandle& cmd_bo_handle) {
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
|
||||
AieAqlQueue& aie_queue) {
|
||||
HSA_QUEUEID& queue_id, uint32_t num_core_tiles) {
|
||||
// Stores instruction and operand BOs.
|
||||
std::vector<uint32_t> bo_handles;
|
||||
|
||||
@@ -593,8 +623,13 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin
|
||||
}
|
||||
});
|
||||
|
||||
if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) {
|
||||
return HSA_STATUS_ERROR_INVALID_QUEUE;
|
||||
}
|
||||
|
||||
auto hw_ctx_handle = static_cast<uint32_t>(queue_id);
|
||||
// PDI cache. If the cache is updated, a new hardware context will be created for the queue.
|
||||
auto pdi_cache_it = hw_ctx_pdi_cache_map.find(aie_queue.GetHwCtxHandle());
|
||||
auto pdi_cache_it = hw_ctx_pdi_cache_map.find(hw_ctx_handle);
|
||||
auto pdi_cache = (pdi_cache_it != hw_ctx_pdi_cache_map.end()) ? pdi_cache_it->second : PDICache{};
|
||||
bool reconfigure_queue = false;
|
||||
|
||||
@@ -664,13 +699,13 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin
|
||||
hw_ctx_pdi_cache_map.erase(pdi_cache_it);
|
||||
}
|
||||
|
||||
hsa_status_t status = ConfigHwCtx(pdi_cache, aie_queue);
|
||||
hsa_status_t status = ConfigHwCtx(pdi_cache, queue_id, num_core_tiles);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Update cache mapping.
|
||||
hw_ctx_pdi_cache_map.emplace(aie_queue.GetHwCtxHandle(), pdi_cache);
|
||||
hw_ctx_pdi_cache_map.emplace(hw_ctx_handle, pdi_cache);
|
||||
}
|
||||
|
||||
// Creating a packet that contains the command chain
|
||||
@@ -707,7 +742,7 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin
|
||||
bo_handles.erase(std::unique(bo_handles.begin(), bo_handles.end()), bo_handles.end());
|
||||
|
||||
// Executing all commands in the command chain
|
||||
status = ExecCmdAndWait(cmd_chain_bo_handle, bo_handles, aie_queue);
|
||||
status = ExecCmdAndWait(cmd_chain_bo_handle, bo_handles, queue_id);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
@@ -760,7 +795,7 @@ hsa_status_t XdnaDriver::SPMSetDestBuffer(uint32_t preferred_node_id, uint32_t s
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::IsModelEnabled(bool* enable) const {
|
||||
// AIE does not support streaming performance monitor.
|
||||
// AIE does not support a driver model.
|
||||
*enable = false;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -803,7 +838,8 @@ XdnaDriver::BOHandle XdnaDriver::FindBOHandle(void* mem) const {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue& aie_queue) {
|
||||
hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, HSA_QUEUEID& queue_id,
|
||||
uint32_t num_core_tiles) {
|
||||
const size_t config_cu_param_size =
|
||||
sizeof(amdxdna_hwctx_param_config_cu) + pdi_bo_handles.size() * sizeof(amdxdna_cu_config);
|
||||
|
||||
@@ -821,17 +857,20 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue
|
||||
xdna_config_cu_param->cu_configs[i].cu_func = default_cu_func;
|
||||
}
|
||||
|
||||
if (aie_queue.GetHwCtxHandle() != AMDXDNA_INVALID_BO_HANDLE) {
|
||||
auto hw_ctx_handle = static_cast<uint32_t>(queue_id);
|
||||
|
||||
if (hw_ctx_handle != AMDXDNA_INVALID_CTX_HANDLE) {
|
||||
// Destroy the hardware context
|
||||
// Note: we can do this because we have forced synchronization between
|
||||
// command chains. If we move to a more asynchronous model, we will need to
|
||||
// figure out how hardware context destruction works while applications
|
||||
// are running
|
||||
amdxdna_drm_destroy_hwctx destroy_hwctx_args = {};
|
||||
destroy_hwctx_args.handle = aie_queue.GetHwCtxHandle();
|
||||
destroy_hwctx_args.handle = hw_ctx_handle;
|
||||
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
queue_id = AMDXDNA_INVALID_CTX_HANDLE;
|
||||
}
|
||||
|
||||
// Create the new hardware context
|
||||
@@ -840,7 +879,7 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue
|
||||
amdxdna_drm_create_hwctx create_hwctx_args = {};
|
||||
create_hwctx_args.qos_p = reinterpret_cast<uintptr_t>(&qos_info);
|
||||
create_hwctx_args.max_opc = 0x800;
|
||||
create_hwctx_args.num_tiles = aie_queue.GetAgent().GetNumCores();
|
||||
create_hwctx_args.num_tiles = num_core_tiles;
|
||||
|
||||
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
@@ -857,7 +896,7 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
aie_queue.SetHwCtxHandle(create_hwctx_args.handle);
|
||||
queue_id = create_hwctx_args.handle;
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -55,53 +55,45 @@ class AieAgent : public core::Agent {
|
||||
public:
|
||||
/// @brief AIE agent constructor.
|
||||
/// @param [in] node Node id.
|
||||
AieAgent(uint32_t node);
|
||||
AieAgent(uint32_t node, const HsaNodeProperties& node_props);
|
||||
|
||||
// @brief AIE agent destructor.
|
||||
~AieAgent();
|
||||
// @brief AIE agent destructor.
|
||||
~AieAgent();
|
||||
|
||||
hsa_status_t VisitRegion(bool include_peer,
|
||||
hsa_status_t (*callback)(hsa_region_t region,
|
||||
void *data),
|
||||
void *data) const;
|
||||
hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region,
|
||||
void *data),
|
||||
void *data) const override;
|
||||
hsa_status_t VisitRegion(bool include_peer,
|
||||
hsa_status_t (*callback)(hsa_region_t region, void* data),
|
||||
void* data) const;
|
||||
hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region, void* data),
|
||||
void* data) const override;
|
||||
|
||||
hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache,
|
||||
void *data),
|
||||
void *value) const override;
|
||||
hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
|
||||
void* value) const override;
|
||||
|
||||
hsa_status_t IterateSupportedIsas(
|
||||
hsa_status_t (*callback)(hsa_isa_t isa, void* data),
|
||||
void* data) const override;
|
||||
hsa_status_t IterateSupportedIsas(hsa_status_t (*callback)(hsa_isa_t isa, void* data),
|
||||
void* data) const override;
|
||||
|
||||
hsa_status_t GetInfo(hsa_agent_info_t attribute, void *value) const override;
|
||||
hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;
|
||||
|
||||
hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type, uint64_t flags,
|
||||
core::HsaEventCallback event_callback, void* data,
|
||||
uint32_t private_segment_size, uint32_t group_segment_size,
|
||||
core::Queue** queue) override;
|
||||
hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type, uint64_t flags,
|
||||
core::HsaEventCallback event_callback, void* data,
|
||||
uint32_t private_segment_size, uint32_t group_segment_size,
|
||||
core::Queue** queue) override;
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
const std::vector<const core::Isa*>& supported_isas() const override {
|
||||
return supported_isas_;
|
||||
}
|
||||
// @brief Override from core::Agent.
|
||||
const std::vector<const core::Isa*>& supported_isas() const override { return supported_isas_; }
|
||||
|
||||
const std::vector<const core::MemoryRegion *> ®ions() const override {
|
||||
return regions_;
|
||||
}
|
||||
const std::vector<const core::MemoryRegion*>& regions() const override { return regions_; }
|
||||
|
||||
/// @brief Getter for the AIE system allocator.
|
||||
const std::function<void *(size_t size, size_t align,
|
||||
core::MemoryRegion::AllocateFlags flags)> &
|
||||
system_allocator() const {
|
||||
return system_allocator_;
|
||||
}
|
||||
/// @brief Getter for the AIE system allocator.
|
||||
const std::function<void*(size_t size, size_t align, core::MemoryRegion::AllocateFlags flags)>&
|
||||
system_allocator() const {
|
||||
return system_allocator_;
|
||||
}
|
||||
|
||||
/// @brief Getter for the AIE system deallocator.
|
||||
const std::function<void(void*)>& system_deallocator() const { return system_deallocator_; }
|
||||
|
||||
const HsaNodeProperties& properties() const { return node_props_; }
|
||||
// AIE agent methods.
|
||||
/// @brief Get the number of columns on this AIE agent.
|
||||
uint32_t GetNumCols() const { return num_cols_; }
|
||||
@@ -134,6 +126,7 @@ private:
|
||||
const uint32_t max_aql_size_ = 0x40;
|
||||
const uint32_t max_queues_ = 1;
|
||||
|
||||
const HsaNodeProperties node_props_;
|
||||
/// @brief Number of columns in the AIE array.
|
||||
uint32_t num_cols_ = 0;
|
||||
/// @brief Number of rows of core tiles in the AIE array. Not all rows in a
|
||||
|
||||
@@ -103,14 +103,6 @@ class AieAqlQueue : public core::Queue,
|
||||
/// @brief Returns the agent associated with this queue.
|
||||
AieAgent& GetAgent() { return agent_; }
|
||||
|
||||
/// @brief Sets the hardware context.
|
||||
void SetHwCtxHandle(uint32_t hw_ctx_handle) {
|
||||
hw_ctx_handle_ = hw_ctx_handle;
|
||||
}
|
||||
|
||||
/// @brief Returns the hardware context.
|
||||
uint32_t GetHwCtxHandle() const { return hw_ctx_handle_; }
|
||||
|
||||
// GPU-specific queue functions are unsupported.
|
||||
|
||||
hsa_status_t GetCUMasking(uint32_t num_cu_mask_count,
|
||||
@@ -141,17 +133,6 @@ class AieAqlQueue : public core::Queue,
|
||||
/// @brief Called when the doorbell is rung to submit all queued packets.
|
||||
void SubmitPackets();
|
||||
|
||||
/// @brief Handle for an application context on the AIE device.
|
||||
///
|
||||
/// Each user queue will have an associated context. This handle is assigned
|
||||
/// by the driver on context creation.
|
||||
///
|
||||
/// TODO: For now we support a single context that allocates all core tiles in
|
||||
/// the array. In the future we can make the number of tiles configurable so
|
||||
/// that multiple workloads with different core tile configurations can
|
||||
/// execute on the AIE agent at the same time.
|
||||
uint32_t hw_ctx_handle_ = std::numeric_limits<uint32_t>::max();
|
||||
|
||||
/// @brief Indicates if queue is active.
|
||||
std::atomic<bool> active_;
|
||||
static __forceinline int& rtti_id() {
|
||||
|
||||
@@ -2,24 +2,24 @@
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
//
|
||||
// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
@@ -29,7 +29,7 @@
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
@@ -47,8 +47,6 @@
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
#include "hsakmt/hsakmt.h"
|
||||
|
||||
#include "core/inc/amd_gpu_agent.h"
|
||||
#include "core/inc/blit.h"
|
||||
#include "core/inc/runtime.h"
|
||||
|
||||
@@ -97,8 +97,17 @@ public:
|
||||
void **mem, size_t size,
|
||||
uint32_t node_id) override;
|
||||
hsa_status_t FreeMemory(void *mem, size_t size) override;
|
||||
hsa_status_t CreateQueue(core::Queue &queue) const override;
|
||||
hsa_status_t DestroyQueue(core::Queue &queue) const override;
|
||||
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const override;
|
||||
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
|
||||
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
|
||||
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
|
||||
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
|
||||
uint32_t* queue_cu_mask) const override;
|
||||
hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
|
||||
uint32_t* first_gws) const override;
|
||||
hsa_status_t ExportDMABuf(void *mem, size_t size, int *dmabuf_fd,
|
||||
size_t *offset) override;
|
||||
hsa_status_t ImportDMABuf(int dmabuf_fd, core::Agent &agent,
|
||||
|
||||
@@ -95,8 +95,6 @@ class Queue;
|
||||
|
||||
namespace AMD {
|
||||
|
||||
class AieAqlQueue;
|
||||
|
||||
// @brief: Used to transform an address into a device address
|
||||
constexpr uint32_t DEV_ADDR_BASE = 0x04000000;
|
||||
constexpr uint32_t DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;
|
||||
@@ -209,8 +207,17 @@ public:
|
||||
void **mem, size_t size,
|
||||
uint32_t node_id) override;
|
||||
hsa_status_t FreeMemory(void *mem, size_t size) override;
|
||||
hsa_status_t CreateQueue(core::Queue &queue) const override;
|
||||
hsa_status_t DestroyQueue(core::Queue &queue) const override;
|
||||
hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr,
|
||||
uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const override;
|
||||
hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority,
|
||||
void* queue_addr, uint64_t queue_size, HsaEvent* event) const override;
|
||||
hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override;
|
||||
hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
|
||||
uint32_t* queue_cu_mask) const override;
|
||||
hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
|
||||
uint32_t* first_gws) const override;
|
||||
hsa_status_t ExportDMABuf(void *mem, size_t size, int *dmabuf_fd,
|
||||
size_t *offset) override;
|
||||
hsa_status_t ImportDMABuf(int dmabuf_fd, core::Agent &agent,
|
||||
@@ -223,7 +230,7 @@ public:
|
||||
|
||||
/// @brief Submits @p num_pkts packets in a command chain.
|
||||
hsa_status_t SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
|
||||
AieAqlQueue& aie_queue);
|
||||
HSA_QUEUEID& queue_id, uint32_t num_core_tiles);
|
||||
|
||||
hsa_status_t SPMAcquire(uint32_t preferred_node_id) const override;
|
||||
hsa_status_t SPMRelease(uint32_t preferred_node_id) const override;
|
||||
@@ -243,7 +250,8 @@ public:
|
||||
BOHandle FindBOHandle(void* mem) const;
|
||||
|
||||
/// @brief Creates a new hardware context with the given PDI BO handles.
|
||||
hsa_status_t ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue& aie_queue);
|
||||
hsa_status_t ConfigHwCtx(const PDICache& pdi_bo_handles, HSA_QUEUEID& queue_id,
|
||||
uint32_t num_core_tiles);
|
||||
|
||||
hsa_status_t QueryDriverVersion();
|
||||
|
||||
@@ -274,7 +282,7 @@ public:
|
||||
/// @param bo_handles handles associated with the command
|
||||
/// @param aie_queue queue to submit to
|
||||
hsa_status_t ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle,
|
||||
const std::vector<uint32_t>& bo_handles, AieAqlQueue& aie_queue);
|
||||
const std::vector<uint32_t>& bo_handles, HSA_QUEUEID queue_id);
|
||||
|
||||
/// TODO: Remove this in the future and rely on the core Runtime
|
||||
/// object to track handle allocations. Using the VMEM API for mapping XDNA
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#ifndef HSA_RUNTME_CORE_INC_DRIVER_H_
|
||||
#define HSA_RUNTME_CORE_INC_DRIVER_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
|
||||
@@ -146,9 +147,53 @@ public:
|
||||
|
||||
virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0;
|
||||
|
||||
virtual hsa_status_t CreateQueue(Queue &queue) const = 0;
|
||||
/// @brief Create an agent dispatch queue with user-mode access rights.
|
||||
/// @param[in] node_id Node ID of the agent on which the queue is being created.
|
||||
/// @param[in] type Queue's type.
|
||||
/// @param[in] queue_pct Maximum percentage of a queue's occupancy allowed.
|
||||
/// @param[in] priority Queue's priority for scheduling.
|
||||
/// @param[in] sdma_engine_id ID of the SDMA engine on which the queue is being created. Only used
|
||||
/// if @p type is one of the SDMA queue types.
|
||||
/// @param[in] queue_addr Address of the queue's ring buffer.
|
||||
/// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes.
|
||||
/// @param[in] event HsaEvent for event-driven callbacks.
|
||||
/// @param[out] queue_resource Queue resource information populated by the driver.
|
||||
virtual hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id,
|
||||
void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event,
|
||||
HsaQueueResource& queue_resource) const = 0;
|
||||
|
||||
virtual hsa_status_t DestroyQueue(Queue &queue) const = 0;
|
||||
/// @brief Destroy a queue.
|
||||
/// @param queue_id Kernel-mode driver's assigned queue ID.
|
||||
virtual hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const = 0;
|
||||
|
||||
/// @brief Update a queue's properties.
|
||||
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
|
||||
/// @param[in] queue_pct Maximum percentage of a queue's occupancy allowed.
|
||||
/// @param[in] priority Queue's priority for scheduling.
|
||||
/// @param[in] queue_addr Queue's ring buffer base address.
|
||||
/// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes.
|
||||
/// @param[in] event HsaEvent for event-driven callbacks.
|
||||
virtual hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct,
|
||||
HSA_QUEUE_PRIORITY priority, void* queue_addr,
|
||||
uint64_t queue_size_bytes, HsaEvent* event) const = 0;
|
||||
|
||||
/// @brief Set the CU mask for a queue.
|
||||
/// @details This sets the CU bitmask for a queue. The CU mask determines which CUs
|
||||
/// a queue's dispatches can target. Currently this is only supported for GPU devices.
|
||||
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
|
||||
/// @param[in] cu_mask_count Number of CU bits in the mask.
|
||||
/// @param[in] queue_cu_mask New CU mask for the queue.
|
||||
virtual hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count,
|
||||
uint32_t* queue_cu_mask) const = 0;
|
||||
|
||||
/// @brief Allocate global wave sync (GWS) resource for a queue. This is only supported for GPUs.
|
||||
/// GWS can be used to synchronize wavefronts across the entire GPU device.
|
||||
/// @param[in] queue_id Kernel-mode driver's assigned queue ID.
|
||||
/// @param[in] num_gws Number of GWS slots.
|
||||
/// @param[in] first_gws First GWS slot.
|
||||
virtual hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws,
|
||||
uint32_t* first_gws) const = 0;
|
||||
|
||||
/// @brief Imports memory using dma-buf.
|
||||
///
|
||||
|
||||
@@ -55,10 +55,10 @@
|
||||
namespace rocr {
|
||||
namespace AMD {
|
||||
|
||||
AieAgent::AieAgent(uint32_t node)
|
||||
: core::Agent(core::Runtime::runtime_singleton_->AgentDriver(
|
||||
core::DriverType::XDNA),
|
||||
node, core::Agent::DeviceType::kAmdAieDevice) {
|
||||
AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props)
|
||||
: core::Agent(core::Runtime::runtime_singleton_->AgentDriver(core::DriverType::XDNA), node,
|
||||
core::Agent::DeviceType::kAmdAieDevice),
|
||||
node_props_(node_props) {
|
||||
InitRegionList();
|
||||
InitAllocators();
|
||||
GetAgentProperties();
|
||||
|
||||
@@ -102,16 +102,26 @@ AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_
|
||||
signal_.queue_ptr = &amd_queue_;
|
||||
active_ = true;
|
||||
|
||||
auto &drv = static_cast<XdnaDriver &>(agent_.driver());
|
||||
drv.CreateQueue(*this);
|
||||
HsaQueueResource queue_resource = {};
|
||||
hsa_status_t status =
|
||||
agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, HSA_QUEUE_PRIORITY_NORMAL, 0,
|
||||
nullptr, queue_size_bytes_, nullptr, queue_resource);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
throw AMD::hsa_exception(status, "Failed to create a hardware context for an AIE queue.");
|
||||
}
|
||||
|
||||
queue_id_ = queue_resource.QueueId;
|
||||
amd_queue_.hsa_queue.id = GetQueueId();
|
||||
}
|
||||
|
||||
AieAqlQueue::~AieAqlQueue() {
|
||||
AieAqlQueue::Inactivate();
|
||||
|
||||
if (ring_buf_) agent_.system_deallocator()(ring_buf_);
|
||||
|
||||
if (shared_queue_) core::Runtime::runtime_singleton_->system_deallocator()(shared_queue_);
|
||||
if (ring_buf_) {
|
||||
agent_.system_deallocator()(ring_buf_);
|
||||
}
|
||||
if (shared_queue_) {
|
||||
core::Runtime::runtime_singleton_->system_deallocator()(shared_queue_);
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t AieAqlQueue::Inactivate() {
|
||||
@@ -119,9 +129,7 @@ hsa_status_t AieAqlQueue::Inactivate() {
|
||||
hsa_status_t status(HSA_STATUS_SUCCESS);
|
||||
|
||||
if (active) {
|
||||
auto &drv = static_cast<XdnaDriver &>(agent_.driver());
|
||||
status = drv.DestroyQueue(*this);
|
||||
hw_ctx_handle_ = std::numeric_limits<uint32_t>::max();
|
||||
agent_.driver().DestroyQueue(queue_id_);
|
||||
}
|
||||
|
||||
return status;
|
||||
@@ -237,7 +245,8 @@ void AieAqlQueue::SubmitPackets() {
|
||||
|
||||
// Call into the driver to submit from cur_id to write_dispatch_id.
|
||||
// Submitting the command chain might create a new hardware context.
|
||||
hsa_status_t status = driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, *this);
|
||||
hsa_status_t status = driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, queue_id_,
|
||||
agent_.properties().NumNeuralCores);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
assert(false && "Could not submit packets");
|
||||
}
|
||||
|
||||
@@ -195,10 +195,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
|
||||
// boolean flag
|
||||
const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
|
||||
(use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
|
||||
if (HSAKMT_STATUS_SUCCESS != HSAKMT_CALL(hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100,
|
||||
HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng,
|
||||
queue_start_addr_, kQueueSize, NULL,
|
||||
&queue_resource_))) {
|
||||
if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM,
|
||||
rec_eng, queue_start_addr_, kQueueSize, nullptr,
|
||||
queue_resource_) != HSA_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
|
||||
@@ -173,7 +173,7 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac
|
||||
}
|
||||
|
||||
void DiscoverAie(uint32_t node_id, HsaNodeProperties& node_prop) {
|
||||
AieAgent* aie = new AieAgent(node_id);
|
||||
AieAgent* aie = new AieAgent(node_id, node_prop);
|
||||
core::Runtime::runtime_singleton_->RegisterAgent(aie, true);
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user