From bacf61dde9809dbe4e04fbd95d23476f78aef901 Mon Sep 17 00:00:00 2001 From: Honglei Huang Date: Tue, 1 Jul 2025 14:00:37 +0800 Subject: [PATCH] rocr/driver: add GetDeviceHandle to driver interface This commit introduces a new GetDeviceHandle API to the driver interface, allowing retrieval of the device handle for a specific node. - Implemented GetDeviceHandle in KfdDriver to fetch the AMD GPU device handle using hsaKmtGetAMDGPUDeviceHandle. - Added a stub implementation of GetDeviceHandle in XdnaDriver that returns HSA_STATUS_ERROR. - Modified GpuAgent::InitLibDrm to use driver().GetDeviceHandle instead of directly calling hsaKmtGetAMDGPUDeviceHandle. Signed-off-by: Honglei Huang [ROCm/ROCR-Runtime commit: 05b83e72d9a72a7febd03e717cb1cc53b774e4b2] --- .../hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp | 9 +++++++++ .../hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 4 ++++ .../runtime/hsa-runtime/core/inc/amd_kfd_driver.h | 1 + .../runtime/hsa-runtime/core/inc/amd_xdna_driver.h | 1 + .../rocr-runtime/runtime/hsa-runtime/core/inc/driver.h | 7 +++++++ .../runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp | 8 ++++---- 6 files changed, 26 insertions(+), 4 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 48506c60e9..8271cb5904 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -574,6 +574,15 @@ hsa_status_t KfdDriver::SetTrapHandler(uint32_t node_id, const void* base, uint6 return HSA_STATUS_SUCCESS; } +hsa_status_t KfdDriver::GetDeviceHandle(uint32_t node_id, void** device_handle) const { + assert(device_handle); + + if (HSAKMT_CALL(hsaKmtGetAMDGPUDeviceHandle(node_id, reinterpret_cast<HsaAMDGPUDeviceHandle*>(device_handle))) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + hsa_status_t 
KfdDriver::IsModelEnabled(bool* enable) const { // AIE does not support streaming performance monitor. HSAKMT_STATUS status = HSAKMT_STATUS_ERROR; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index cec114df16..52c50d92f4 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -877,5 +877,9 @@ hsa_status_t XdnaDriver::SetTrapHandler(uint32_t node_id, const void* base, uint return HSA_STATUS_ERROR; } +hsa_status_t XdnaDriver::GetDeviceHandle(uint32_t node_id, void** device_handle) const { + return HSA_STATUS_ERROR; +} + } // namespace AMD } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index 8eac1e38ff..7932f373ff 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -124,6 +124,7 @@ public: bool* is_spm_data_loss) const override; hsa_status_t SetTrapHandler(uint32_t node_id, const void* base, uint64_t base_size, const void* buffer_base, uint64_t buffer_base_size) const override; + hsa_status_t GetDeviceHandle(uint32_t node_id, void** device_handle) const override; hsa_status_t OpenSMI(uint32_t node_id, int* fd) const override; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 814bd0e1a3..42833e5aef 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -238,6 +238,7 @@ public: bool* is_spm_data_loss) const override; hsa_status_t SetTrapHandler(uint32_t node_id, const void* base, 
uint64_t base_size, const void* buffer_base, uint64_t buffer_base_size) const override; + hsa_status_t GetDeviceHandle(uint32_t node_id, void** device_handle) const override; hsa_status_t IsModelEnabled(bool* enable) const override; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h index 1b578a54dd..21add45cb5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/driver.h @@ -269,6 +269,13 @@ public: virtual hsa_status_t SetTrapHandler(uint32_t node_id, const void* base, uint64_t base_size, const void* buffer_base, uint64_t buffer_base_size) const = 0; + /// @brief Gets the device handle for a specific node. + /// @param node_id Node ID of the agent + /// @param[out] device_handle Device handle + /// @return HSA_STATUS_SUCCESS if the driver successfully returns the device handle + virtual hsa_status_t GetDeviceHandle(uint32_t node_id, void** device_handle) const = 0; + + /// @brief Check if the HSA KMT Model is enabled /// @param[out] enable True if the model is enabled, false otherwise virtual hsa_status_t IsModelEnabled(bool* enable) const = 0; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index bee8664090..013fa119f5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -601,12 +601,12 @@ void GpuAgent::InitCacheList() { } void GpuAgent::InitLibDrm() { - HSAKMT_STATUS status; + hsa_status_t status; HsaAMDGPUDeviceHandle device_handle; - status = HSAKMT_CALL(hsaKmtGetAMDGPUDeviceHandle(node_id(), &device_handle)); - if (status != HSAKMT_STATUS_SUCCESS) - throw AMD::hsa_exception(HSA_STATUS_ERROR, + status = driver().GetDeviceHandle(node_id(), &device_handle); + if (status != 
HSA_STATUS_SUCCESS) + throw AMD::hsa_exception(status, "Agent creation failed.\nlibdrm get device handle failed.\n"); ldrm_dev_ = (amdgpu_device_handle)device_handle;