From e03d44d742670ab4020437b59b9d73bbf56adc8b Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 16 Jun 2025 12:04:29 -0700 Subject: [PATCH] rocr: Update Driver queue-related APIs Update the user-mode driver queue APIs to leverage KMT types. Move queue-related calls to the core::Driver API. --- .../core/driver/kfd/amd_kfd_driver.cpp | 42 ++++++- .../core/driver/xdna/amd_xdna_driver.cpp | 107 ++++++++++++------ runtime/hsa-runtime/core/inc/amd_aie_agent.h | 61 +++++----- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 19 ---- runtime/hsa-runtime/core/inc/amd_blit_sdma.h | 20 ++-- runtime/hsa-runtime/core/inc/amd_kfd_driver.h | 13 ++- .../hsa-runtime/core/inc/amd_xdna_driver.h | 22 ++-- runtime/hsa-runtime/core/inc/driver.h | 49 +++++++- .../core/runtime/amd_aie_agent.cpp | 8 +- .../core/runtime/amd_aie_aql_queue.cpp | 29 +++-- .../core/runtime/amd_blit_sdma.cpp | 7 +- .../hsa-runtime/core/runtime/amd_topology.cpp | 2 +- 12 files changed, 249 insertions(+), 130 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 1c05d95f5e..74f2c69f80 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -353,11 +353,49 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) { return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } -hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) const { +hsa_status_t KfdDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, + void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, + HsaQueueResource& queue_resource) const { + if (HSAKMT_CALL(hsaKmtCreateQueueExt(node_id, type, queue_pct, priority, sdma_engine_id, + queue_addr, queue_size_bytes, event, &queue_resource)) != + HSAKMT_STATUS_SUCCESS) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } return HSA_STATUS_SUCCESS; } -hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const { +hsa_status_t KfdDriver::DestroyQueue(HSA_QUEUEID queue_id) const { + if (HSAKMT_CALL(hsaKmtDestroyQueue(queue_id)) != HSAKMT_STATUS_SUCCESS) { + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t KfdDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, void* queue_addr, + uint64_t queue_size, HsaEvent* event) const { + if (HSAKMT_CALL(hsaKmtUpdateQueue(queue_id, queue_pct, priority, queue_addr, queue_size, + event)) != HSAKMT_STATUS_SUCCESS) { + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t KfdDriver::SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count, + uint32_t* queue_cu_mask) const { + if (HSAKMT_CALL(hsaKmtSetQueueCUMask(queue_id, cu_mask_count, queue_cu_mask)) != + HSAKMT_STATUS_SUCCESS) { + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t KfdDriver::AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws, + uint32_t* first_gws) const { + if (HSAKMT_CALL(hsaKmtAllocQueueGWS(queue_id, num_gws, first_gws)) != HSAKMT_STATUS_SUCCESS) { + return HSA_STATUS_ERROR; + } return HSA_STATUS_SUCCESS; } diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 0a398e60f2..ef4ab7696e 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -50,7 +50,6 @@ #include #include -#include "core/inc/amd_aie_aql_queue.h" #include "core/inc/amd_memory_region.h" #include "core/inc/runtime.h" #include "core/util/memory.h" @@ -173,10 +172,22 @@ hsa_status_t XdnaDriver::GetSystemProperties(HsaSystemProperties& sys_props) con } hsa_status_t XdnaDriver::GetNodeProperties(HsaNodeProperties& node_props, uint32_t node_id) const { + amdxdna_drm_query_aie_metadata aie_metadata = {}; + amdxdna_drm_get_info get_info_args = {}; + get_info_args.param = DRM_AMDXDNA_QUERY_AIE_METADATA; + get_info_args.buffer_size = sizeof(aie_metadata); + get_info_args.buffer = reinterpret_cast(&aie_metadata); + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_INFO, &get_info_args) < 0) { + return HSA_STATUS_ERROR; + } + + // Right now can only target N-1 columns as that is the number of shim DMAs + // in NPU1 devices. + node_props.NumNeuralCores = (aie_metadata.cols - 1) * aie_metadata.rows; /// @todo XDNA driver currently only supports single-node AIE /// devices over PCIe. Update this once we can get topology /// information dynamically from the sysfs. - node_props.NumNeuralCores = 1; node_props.NumIOLinks = 0; return HSA_STATUS_SUCCESS; } @@ -303,33 +314,47 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) { return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const { - if (!AieAqlQueue::IsType(&queue)) { +hsa_status_t XdnaDriver::CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, + void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, + HsaQueueResource& queue_resource) const { + queue_resource.QueueId = AMDXDNA_INVALID_CTX_HANDLE; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::DestroyQueue(HSA_QUEUEID queue_id) const { + if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) { return HSA_STATUS_ERROR_INVALID_QUEUE; } - // Set the hw ctx handle of the queue to invalid to avoid incorrect destruction. - auto& aie_queue = static_cast(queue); - aie_queue.SetHwCtxHandle(AMDXDNA_INVALID_BO_HANDLE); + auto hw_ctx_handle = static_cast(queue_id); + amdxdna_drm_destroy_hwctx destroy_hwctx_args = {}; + destroy_hwctx_args.handle = hw_ctx_handle; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) { + return HSA_STATUS_ERROR; + } return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::DestroyQueue(core::Queue &queue) const { - if (!AieAqlQueue::IsType(&queue)) { - return HSA_STATUS_ERROR_INVALID_QUEUE; - } +hsa_status_t XdnaDriver::UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, void* queue_addr, + uint64_t queue_size, HsaEvent* event) const { + // AIE doesn't support queue updates. + return HSA_STATUS_ERROR_INVALID_QUEUE; +} - auto& aie_queue = static_cast(queue); - if (aie_queue.GetHwCtxHandle() != AMDXDNA_INVALID_BO_HANDLE) { - amdxdna_drm_destroy_hwctx destroy_hwctx_args = {}; - destroy_hwctx_args.handle = aie_queue.GetHwCtxHandle(); - if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) { - return HSA_STATUS_ERROR; - } - } +hsa_status_t XdnaDriver::SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count, + uint32_t* queue_cu_mask) const { + // AIE doesn't support queue CU masks. + return HSA_STATUS_ERROR_INVALID_QUEUE; +} - return HSA_STATUS_SUCCESS; +hsa_status_t XdnaDriver::AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws, + uint32_t* first_gws) const { + // AIE doesn't support GWS. + return HSA_STATUS_ERROR_INVALID_QUEUE; } hsa_status_t XdnaDriver::ExportDMABuf(void *mem, size_t size, int *dmabuf_fd, @@ -470,10 +495,15 @@ hsa_status_t XdnaDriver::FreeDeviceHeap() { hsa_status_t XdnaDriver::ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle, const std::vector& bo_handles, - AieAqlQueue& aie_queue) { + HSA_QUEUEID queue_id) { + if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) { + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + + auto hw_ctx_handle = static_cast(queue_id); // Submit command chain. amdxdna_drm_exec_cmd exec_cmd = {}; - exec_cmd.hwctx = aie_queue.GetHwCtxHandle(); + exec_cmd.hwctx = hw_ctx_handle; exec_cmd.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF; exec_cmd.cmd_handles = cmd_chain_bo_handle.handle; exec_cmd.args = reinterpret_cast(bo_handles.data()); @@ -484,7 +514,7 @@ hsa_status_t XdnaDriver::ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle, // Waiting for command chain to finish. amdxdna_drm_wait_cmd wait_cmd = {}; - wait_cmd.hwctx = aie_queue.GetHwCtxHandle(); + wait_cmd.hwctx = hw_ctx_handle; wait_cmd.timeout = DEFAULT_TIMEOUT_VAL; wait_cmd.seq = exec_cmd.seq; @@ -579,7 +609,7 @@ hsa_status_t XdnaDriver::CreateCmdBO(uint32_t size, BOHandle& cmd_bo_handle) { } hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts, - AieAqlQueue& aie_queue) { + HSA_QUEUEID& queue_id, uint32_t num_core_tiles) { // Stores instruction and operand BOs. std::vector bo_handles; @@ -593,8 +623,13 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin } }); + if (queue_id == AMDXDNA_INVALID_CTX_HANDLE) { + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + + auto hw_ctx_handle = static_cast(queue_id); // PDI cache. If the cache is updated, a new hardware context will be created for the queue. - auto pdi_cache_it = hw_ctx_pdi_cache_map.find(aie_queue.GetHwCtxHandle()); + auto pdi_cache_it = hw_ctx_pdi_cache_map.find(hw_ctx_handle); auto pdi_cache = (pdi_cache_it != hw_ctx_pdi_cache_map.end()) ? pdi_cache_it->second : PDICache{}; bool reconfigure_queue = false; @@ -664,13 +699,13 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin hw_ctx_pdi_cache_map.erase(pdi_cache_it); } - hsa_status_t status = ConfigHwCtx(pdi_cache, aie_queue); + hsa_status_t status = ConfigHwCtx(pdi_cache, queue_id, num_core_tiles); if (status != HSA_STATUS_SUCCESS) { return status; } // Update cache mapping. - hw_ctx_pdi_cache_map.emplace(aie_queue.GetHwCtxHandle(), pdi_cache); + hw_ctx_pdi_cache_map.emplace(hw_ctx_handle, pdi_cache); } // Creating a packet that contains the command chain @@ -707,7 +742,7 @@ hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uin bo_handles.erase(std::unique(bo_handles.begin(), bo_handles.end()), bo_handles.end()); // Executing all commands in the command chain - status = ExecCmdAndWait(cmd_chain_bo_handle, bo_handles, aie_queue); + status = ExecCmdAndWait(cmd_chain_bo_handle, bo_handles, queue_id); if (status != HSA_STATUS_SUCCESS) { return status; } @@ -760,7 +795,7 @@ hsa_status_t XdnaDriver::SPMSetDestBuffer(uint32_t preferred_node_id, uint32_t s } hsa_status_t XdnaDriver::IsModelEnabled(bool* enable) const { - // AIE does not support streaming performance monitor. + // AIE does not support a driver model. *enable = false; return HSA_STATUS_SUCCESS; } @@ -803,7 +838,8 @@ XdnaDriver::BOHandle XdnaDriver::FindBOHandle(void* mem) const { return it->second; } -hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue& aie_queue) { +hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, HSA_QUEUEID& queue_id, + uint32_t num_core_tiles) { const size_t config_cu_param_size = sizeof(amdxdna_hwctx_param_config_cu) + pdi_bo_handles.size() * sizeof(amdxdna_cu_config); @@ -821,17 +857,20 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue xdna_config_cu_param->cu_configs[i].cu_func = default_cu_func; } - if (aie_queue.GetHwCtxHandle() != AMDXDNA_INVALID_BO_HANDLE) { + auto hw_ctx_handle = static_cast(queue_id); + + if (hw_ctx_handle != AMDXDNA_INVALID_CTX_HANDLE) { // Destroy the hardware context // Note: we can do this because we have forced synchronization between // command chains. If we move to a more asynchronous model, we will need to // figure out how hardware context destruction works while applications // are running amdxdna_drm_destroy_hwctx destroy_hwctx_args = {}; - destroy_hwctx_args.handle = aie_queue.GetHwCtxHandle(); + destroy_hwctx_args.handle = hw_ctx_handle; if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) { return HSA_STATUS_ERROR; } + queue_id = AMDXDNA_INVALID_CTX_HANDLE; } // Create the new hardware context @@ -840,7 +879,7 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue amdxdna_drm_create_hwctx create_hwctx_args = {}; create_hwctx_args.qos_p = reinterpret_cast(&qos_info); create_hwctx_args.max_opc = 0x800; - create_hwctx_args.num_tiles = aie_queue.GetAgent().GetNumCores(); + create_hwctx_args.num_tiles = num_core_tiles; if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; @@ -857,7 +896,7 @@ hsa_status_t XdnaDriver::ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue return HSA_STATUS_ERROR; } - aie_queue.SetHwCtxHandle(create_hwctx_args.handle); + queue_id = create_hwctx_args.handle; return HSA_STATUS_SUCCESS; } diff --git a/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/runtime/hsa-runtime/core/inc/amd_aie_agent.h index 938d6182fb..a8998a76a0 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_agent.h @@ -55,53 +55,45 @@ class AieAgent : public core::Agent { public: /// @brief AIE agent constructor. /// @param [in] node Node id. - AieAgent(uint32_t node); + AieAgent(uint32_t node, const HsaNodeProperties& node_props); - // @brief AIE agent destructor. - ~AieAgent(); + // @brief AIE agent destructor. + ~AieAgent(); - hsa_status_t VisitRegion(bool include_peer, - hsa_status_t (*callback)(hsa_region_t region, - void *data), - void *data) const; - hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region, - void *data), - void *data) const override; + hsa_status_t VisitRegion(bool include_peer, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const; + hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const override; - hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, - void *data), - void *value) const override; + hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data), + void* value) const override; - hsa_status_t IterateSupportedIsas( - hsa_status_t (*callback)(hsa_isa_t isa, void* data), - void* data) const override; + hsa_status_t IterateSupportedIsas(hsa_status_t (*callback)(hsa_isa_t isa, void* data), + void* data) const override; - hsa_status_t GetInfo(hsa_agent_info_t attribute, void *value) const override; + hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override; - hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type, uint64_t flags, - core::HsaEventCallback event_callback, void* data, - uint32_t private_segment_size, uint32_t group_segment_size, - core::Queue** queue) override; + hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type, uint64_t flags, + core::HsaEventCallback event_callback, void* data, + uint32_t private_segment_size, uint32_t group_segment_size, + core::Queue** queue) override; - // @brief Override from core::Agent. - const std::vector& supported_isas() const override { - return supported_isas_; - } + // @brief Override from core::Agent. + const std::vector& supported_isas() const override { return supported_isas_; } - const std::vector ®ions() const override { - return regions_; - } + const std::vector& regions() const override { return regions_; } - /// @brief Getter for the AIE system allocator. - const std::function & - system_allocator() const { - return system_allocator_; - } + /// @brief Getter for the AIE system allocator. + const std::function& + system_allocator() const { + return system_allocator_; + } /// @brief Getter for the AIE system deallocator. const std::function& system_deallocator() const { return system_deallocator_; } + const HsaNodeProperties& properties() const { return node_props_; } // AIE agent methods. /// @brief Get the number of columns on this AIE agent. uint32_t GetNumCols() const { return num_cols_; } @@ -134,6 +126,7 @@ private: const uint32_t max_aql_size_ = 0x40; const uint32_t max_queues_ = 1; + const HsaNodeProperties node_props_; /// @brief Number of columns in the AIE array. uint32_t num_cols_ = 0; /// @brief Number of rows of core tiles in the AIE array. Not all rows in a diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 8e6739722f..d007d00c2c 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -103,14 +103,6 @@ class AieAqlQueue : public core::Queue, /// @brief Returns the agent associated with this queue. AieAgent& GetAgent() { return agent_; } - /// @brief Sets the hardware context. - void SetHwCtxHandle(uint32_t hw_ctx_handle) { - hw_ctx_handle_ = hw_ctx_handle; - } - - /// @brief Returns the hardware context. - uint32_t GetHwCtxHandle() const { return hw_ctx_handle_; } - // GPU-specific queue functions are unsupported. hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, @@ -141,17 +133,6 @@ class AieAqlQueue : public core::Queue, /// @brief Called when the doorbell is rung to submit all queued packets. void SubmitPackets(); - /// @brief Handle for an application context on the AIE device. - /// - /// Each user queue will have an associated context. This handle is assigned - /// by the driver on context creation. - /// - /// TODO: For now we support a single context that allocates all core tiles in - /// the array. In the future we can make the number of tiles configurable so - /// that multiple workloads with different core tile configurations can - /// execute on the AIE agent at the same time. - uint32_t hw_ctx_handle_ = std::numeric_limits::max(); - /// @brief Indicates if queue is active. std::atomic active_; static __forceinline int& rtti_id() { diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 8852d40c1b..a5e62ca273 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -47,8 +47,6 @@ #include #include -#include "hsakmt/hsakmt.h" - #include "core/inc/amd_gpu_agent.h" #include "core/inc/blit.h" #include "core/inc/runtime.h" diff --git a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index 4579b6e8b0..befa19535e 100644 --- a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -97,8 +97,17 @@ public: void **mem, size_t size, uint32_t node_id) override; hsa_status_t FreeMemory(void *mem, size_t size) override; - hsa_status_t CreateQueue(core::Queue &queue) const override; - hsa_status_t DestroyQueue(core::Queue &queue) const override; + hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr, + uint64_t queue_size_bytes, HsaEvent* event, + HsaQueueResource& queue_resource) const override; + hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority, + void* queue_addr, uint64_t queue_size, HsaEvent* event) const override; + hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override; + hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count, + uint32_t* queue_cu_mask) const override; + hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws, + uint32_t* first_gws) const override; hsa_status_t ExportDMABuf(void *mem, size_t size, int *dmabuf_fd, size_t *offset) override; hsa_status_t ImportDMABuf(int dmabuf_fd, core::Agent &agent, diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 41eeed203c..575873023f 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -95,8 +95,6 @@ class Queue; namespace AMD { -class AieAqlQueue; - // @brief: Used to transform an address into a device address constexpr uint32_t DEV_ADDR_BASE = 0x04000000; constexpr uint32_t DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; @@ -209,8 +207,17 @@ public: void **mem, size_t size, uint32_t node_id) override; hsa_status_t FreeMemory(void *mem, size_t size) override; - hsa_status_t CreateQueue(core::Queue &queue) const override; - hsa_status_t DestroyQueue(core::Queue &queue) const override; + hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, void* queue_addr, + uint64_t queue_size_bytes, HsaEvent* event, + HsaQueueResource& queue_resource) const override; + hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, HSA_QUEUE_PRIORITY priority, + void* queue_addr, uint64_t queue_size, HsaEvent* event) const override; + hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const override; + hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count, + uint32_t* queue_cu_mask) const override; + hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws, + uint32_t* first_gws) const override; hsa_status_t ExportDMABuf(void *mem, size_t size, int *dmabuf_fd, size_t *offset) override; hsa_status_t ImportDMABuf(int dmabuf_fd, core::Agent &agent, @@ -223,7 +230,7 @@ public: /// @brief Submits @p num_pkts packets in a command chain. hsa_status_t SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts, - AieAqlQueue& aie_queue); + HSA_QUEUEID& queue_id, uint32_t num_core_tiles); hsa_status_t SPMAcquire(uint32_t preferred_node_id) const override; hsa_status_t SPMRelease(uint32_t preferred_node_id) const override; @@ -243,7 +250,8 @@ public: BOHandle FindBOHandle(void* mem) const; /// @brief Creates a new hardware context with the given PDI BO handles. - hsa_status_t ConfigHwCtx(const PDICache& pdi_bo_handles, AieAqlQueue& aie_queue); + hsa_status_t ConfigHwCtx(const PDICache& pdi_bo_handles, HSA_QUEUEID& queue_id, + uint32_t num_core_tiles); hsa_status_t QueryDriverVersion(); @@ -274,7 +282,7 @@ public: /// @param bo_handles handles associated with the command /// @param aie_queue queue to submit to hsa_status_t ExecCmdAndWait(const BOHandle& cmd_chain_bo_handle, - const std::vector& bo_handles, AieAqlQueue& aie_queue); + const std::vector& bo_handles, HSA_QUEUEID queue_id); /// TODO: Remove this in the future and rely on the core Runtime /// object to track handle allocations. Using the VMEM API for mapping XDNA diff --git a/runtime/hsa-runtime/core/inc/driver.h b/runtime/hsa-runtime/core/inc/driver.h index 0cf3dd71fe..15d0093795 100644 --- a/runtime/hsa-runtime/core/inc/driver.h +++ b/runtime/hsa-runtime/core/inc/driver.h @@ -43,6 +43,7 @@ #ifndef HSA_RUNTME_CORE_INC_DRIVER_H_ #define HSA_RUNTME_CORE_INC_DRIVER_H_ +#include #include #include @@ -146,9 +147,53 @@ public: virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0; - virtual hsa_status_t CreateQueue(Queue &queue) const = 0; + /// @brief Create an agent dispatch queue with user-mode access rights. + /// @param[in] node_id Node ID of the agent on which the queue is being created. + /// @param[in] type Queue's type. + /// @param[in] queue_pct Maximum percentage of a queue's occupancy allowed. + /// @param[in] priority Queue's priority for scheduling. + /// @param[in] sdma_engine_id ID of the SDMA engine on which the queue is being created. Only used + /// if @p type is one of the SDMA queue types. + /// @param[in] queue_addr Address of the queue's ring buffer. + /// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes. + /// @param[in] event HsaEvent for event-driven callbacks. + /// @param[out] queue_resource Queue resource information populated by the driver. + virtual hsa_status_t CreateQueue(uint32_t node_id, HSA_QUEUE_TYPE type, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, uint32_t sdma_engine_id, + void* queue_addr, uint64_t queue_size_bytes, HsaEvent* event, + HsaQueueResource& queue_resource) const = 0; - virtual hsa_status_t DestroyQueue(Queue &queue) const = 0; + /// @brief Destroy a queue. + /// @param queue_id Kernel-mode driver's assigned queue ID. + virtual hsa_status_t DestroyQueue(HSA_QUEUEID queue_id) const = 0; + + /// @brief Update a queue's properties. + /// @param[in] queue_id Kernel-mode driver's assigned queue ID. + /// @param[in] queue_pct Maximum percentage of a queue's occupancy allowed. + /// @param[in] priority Queue's priority for scheduling. + /// @param[in] queue_addr Queue's ring buffer base address. + /// @param[in] queue_size_bytes Size of the queue's ring buffer in bytes. + /// @param[in] event HsaEvent for event-driven callbacks. + virtual hsa_status_t UpdateQueue(HSA_QUEUEID queue_id, uint32_t queue_pct, + HSA_QUEUE_PRIORITY priority, void* queue_addr, + uint64_t queue_size_bytes, HsaEvent* event) const = 0; + + /// @brief Set the CU mask for a queue. + /// @details This sets the CU bitmask for a queue. The CU mask determines which CUs + /// a queue's dispatches can target. Currently this is only supported for GPU devices. + /// @param[in] queue_id Kernel-mode driver's assigned queue ID. + /// @param[in] cu_mask_count Number of CU bits in the mask. + /// @param[in] queue_cu_mask New CU mask for the queue. + virtual hsa_status_t SetQueueCUMask(HSA_QUEUEID queue_id, uint32_t cu_mask_count, + uint32_t* queue_cu_mask) const = 0; + + /// @brief Allocate global wave sync (GWS) resource for a queue. This is only supported for GPUs. + /// GWS can be used to synchronize wavefronts across the entire GPU device. + /// @param[in] queue_id Kernel-mode driver's assigned queue ID. + /// @param[in] num_gws Number of GWS slots. + /// @param[in] first_gws First GWS slot. + virtual hsa_status_t AllocQueueGWS(HSA_QUEUEID queue_id, uint32_t num_gws, + uint32_t* first_gws) const = 0; /// @brief Imports memory using dma-buf. /// diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index cb3428ce92..1bb319abd8 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -55,10 +55,10 @@ namespace rocr { namespace AMD { -AieAgent::AieAgent(uint32_t node) - : core::Agent(core::Runtime::runtime_singleton_->AgentDriver( - core::DriverType::XDNA), - node, core::Agent::DeviceType::kAmdAieDevice) { +AieAgent::AieAgent(uint32_t node, const HsaNodeProperties& node_props) + : core::Agent(core::Runtime::runtime_singleton_->AgentDriver(core::DriverType::XDNA), node, + core::Agent::DeviceType::kAmdAieDevice), + node_props_(node_props) { InitRegionList(); InitAllocators(); GetAgentProperties(); diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index e182ffaa39..4b22f5419e 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -102,16 +102,26 @@ AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_ signal_.queue_ptr = &amd_queue_; active_ = true; - auto &drv = static_cast(agent_.driver()); - drv.CreateQueue(*this); + HsaQueueResource queue_resource = {}; + hsa_status_t status = + agent_.driver().CreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 0, HSA_QUEUE_PRIORITY_NORMAL, 0, + nullptr, queue_size_bytes_, nullptr, queue_resource); + if (status != HSA_STATUS_SUCCESS) { + throw AMD::hsa_exception(status, "Failed to create a hardware context for an AIE queue."); + } + + queue_id_ = queue_resource.QueueId; + amd_queue_.hsa_queue.id = GetQueueId(); } AieAqlQueue::~AieAqlQueue() { AieAqlQueue::Inactivate(); - - if (ring_buf_) agent_.system_deallocator()(ring_buf_); - - if (shared_queue_) core::Runtime::runtime_singleton_->system_deallocator()(shared_queue_); + if (ring_buf_) { + agent_.system_deallocator()(ring_buf_); + } + if (shared_queue_) { + core::Runtime::runtime_singleton_->system_deallocator()(shared_queue_); + } } hsa_status_t AieAqlQueue::Inactivate() { @@ -119,9 +129,7 @@ hsa_status_t AieAqlQueue::Inactivate() { hsa_status_t status(HSA_STATUS_SUCCESS); if (active) { - auto &drv = static_cast(agent_.driver()); - status = drv.DestroyQueue(*this); - hw_ctx_handle_ = std::numeric_limits::max(); + agent_.driver().DestroyQueue(queue_id_); } return status; @@ -237,7 +245,8 @@ void AieAqlQueue::SubmitPackets() { // Call into the driver to submit from cur_id to write_dispatch_id. // Submitting the command chain might create a new hardware context. - hsa_status_t status = driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, *this); + hsa_status_t status = driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, queue_id_, + agent_.properties().NumNeuralCores); if (status != HSA_STATUS_SUCCESS) { assert(false && "Could not submit packets"); } diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index a9384dfcc0..99b46df41f 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -195,10 +195,9 @@ hsa_status_t BlitSdma: // boolean flag const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID : (use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA); - if (HSAKMT_STATUS_SUCCESS != HSAKMT_CALL(hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100, - HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng, - queue_start_addr_, kQueueSize, NULL, - &queue_resource_))) { + if (agent_->driver().CreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM, + rec_eng, queue_start_addr_, kQueueSize, nullptr, + queue_resource_) != HSA_STATUS_SUCCESS) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } diff --git a/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/runtime/hsa-runtime/core/runtime/amd_topology.cpp index 18bb7a00bc..ac85ca42cc 100644 --- a/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -173,7 +173,7 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnac } void DiscoverAie(uint32_t node_id, HsaNodeProperties& node_prop) { - AieAgent* aie = new AieAgent(node_id); + AieAgent* aie = new AieAgent(node_id, node_prop); core::Runtime::runtime_singleton_->RegisterAgent(aie, true); }