From 3a9d14bb660649130fb40c260ecb36ac5599f21f Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Thu, 5 Jun 2025 16:38:38 -0500 Subject: [PATCH] rocr: Add hsa_amd_portable_export_dmabuf_v2 The original version of hsa_amd_portable_export_dmabuf() did not consider the conditions under which a dmabuf could be shared. In the new version (hsa_amd_portable_export_dmabuf_v2()), the caller can specify the flag HSA_AMD_DMABUF_MAPPING_TYPE_PCIE, which means they want to share the dmabuf over PCIe. In that case, if the GPU that owns the allocation is a PCIe GPU that is not in an XGMI hive and large-BAR is not supported, the export returns an error (HSA_STATUS_ERROR_NOT_SUPPORTED). --- CMakeLists.txt | 2 +- .../core/common/hsa_table_interface.cpp | 6 ++ .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 3 + runtime/hsa-runtime/core/inc/runtime.h | 3 +- .../core/runtime/hsa_api_trace.cpp | 3 +- .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 18 +++- runtime/hsa-runtime/core/runtime/runtime.cpp | 12 ++- runtime/hsa-runtime/hsacore.so.def | 1 + runtime/hsa-runtime/inc/hsa_api_trace.h | 1 + .../hsa-runtime/inc/hsa_api_trace_version.h | 2 +- runtime/hsa-runtime/inc/hsa_ext_amd.h | 86 +++++++++++++++---- 11 files changed, 114 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fe99908d9..935e430807 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,7 +87,7 @@ include(utils) ## Get version strings -get_version("1.16.0") +get_version("1.17.0") if (${ROCM_PATCH_VERSION}) set(VERSION_PATCH ${ROCM_PATCH_VERSION}) endif() diff --git a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index 47427de931..656f601edd 100644 --- a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -1246,6 +1246,12 @@ hsa_status_t HSA_API hsa_amd_portable_export_dmabuf(const void* ptr, size_t size return amdExtTable->hsa_amd_portable_export_dmabuf_fn(ptr, size, dmabuf, offset); } 
+// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size, int* dmabuf, + uint64_t* offset, uint64_t flags) { + return amdExtTable->hsa_amd_portable_export_dmabuf_v2_fn(ptr, size, dmabuf, offset, flags); +} + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_portable_close_dmabuf(int dmabuf) { return amdExtTable->hsa_amd_portable_close_dmabuf_fn(dmabuf); diff --git a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index b2d03de55f..4d5fe0162a 100644 --- a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -302,6 +302,9 @@ hsa_status_t HSA_API hsa_amd_spm_set_dest_buffer(hsa_agent_t agent, size_t size, uint32_t* size_copied, void* dest, bool* is_data_loss); +hsa_status_t HSA_API hsa_amd_portable_export_dmabuf_v2(const void* ptr, + size_t size, int* dmabuf, uint64_t* offset, uint64_t flags); + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf, uint64_t* offset); diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index bf725c31c1..b9b495ea0b 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -375,7 +375,8 @@ class Runtime { hsa_status_t SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, hsa_signal_t completion_signal); - hsa_status_t DmaBufExport(const void* ptr, size_t size, int* dmabuf, uint64_t* offset); + hsa_status_t DmaBufExport(const void* ptr, size_t size, int* dmabuf, + uint64_t* offset, uint64_t flags); hsa_status_t DmaBufClose(int dmabuf); diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index e6640b4c8e..8314f8b0ed 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ 
b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -87,7 +87,7 @@ void HsaApiTable::Init() { // they can add preprocessor macros on the new functions constexpr size_t expected_core_api_table_size = 1016; - constexpr size_t expected_amd_ext_table_size = 600; + constexpr size_t expected_amd_ext_table_size = 608; constexpr size_t expected_image_ext_table_size = 128; constexpr size_t expected_finalizer_ext_table_size = 64; constexpr size_t expected_tools_table_size = 64; @@ -476,6 +476,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_enable_logging_fn = AMD::hsa_amd_enable_logging; amd_ext_api.hsa_amd_signal_wait_all_fn = AMD::hsa_amd_signal_wait_all; amd_ext_api.hsa_amd_memory_get_preferred_copy_engine_fn = AMD::hsa_amd_memory_get_preferred_copy_engine; + amd_ext_api.hsa_amd_portable_export_dmabuf_v2_fn = AMD::hsa_amd_portable_export_dmabuf_v2; } void HsaApiTable::UpdateTools() { diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index cc27fc1a51..ecc0ff82b6 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -1307,14 +1307,28 @@ hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t siz } hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf, - uint64_t* offset) { + uint64_t* offset) { +TRY; +IS_OPEN(); +IS_BAD_PTR(ptr); +IS_BAD_PTR(dmabuf); +IS_BAD_PTR(offset); +IS_ZERO(size); +return core::Runtime::runtime_singleton_->DmaBufExport(ptr, size, dmabuf, + offset, HSA_AMD_DMABUF_MAPPING_TYPE_NONE); +CATCH; +} + +hsa_status_t hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size, + int* dmabuf, uint64_t* offset, uint64_t flags) { TRY; IS_OPEN(); IS_BAD_PTR(ptr); IS_BAD_PTR(dmabuf); IS_BAD_PTR(offset); IS_ZERO(size); - return core::Runtime::runtime_singleton_->DmaBufExport(ptr, size, dmabuf, offset); + return core::Runtime::runtime_singleton_->DmaBufExport(ptr, 
size, + dmabuf, offset, flags); CATCH; } diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index 4413a017be..c9450b0119 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -3078,7 +3078,8 @@ Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) { return agents_by_node_[prefetch_node][0]; } -hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, uint64_t* offset) { +hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, + uint64_t* offset, uint64_t flags) { #ifdef __linux__ ScopedAcquire lock(memory_lock_.shared()); // Lookup containing allocation. @@ -3093,6 +3094,14 @@ hsa_status_t Runtime::DmaBufExport(const void* ptr, size_t size, int* dmabuf, ui if (mem->second.region->owner()->device_type() != Agent::kAmdGpuDevice) return HSA_STATUS_ERROR_INVALID_AGENT; + rocr::AMD::GpuAgent* owner = + static_cast<rocr::AMD::GpuAgent*>(mem->second.region->owner()); + + if (flags & HSA_AMD_DMABUF_MAPPING_TYPE_PCIE && + !owner->is_xgmi_cpu_gpu() && + !owner->LargeBarEnabled()) { + return (hsa_status_t)HSA_STATUS_ERROR_NOT_SUPPORTED; + } int fd; uint64_t off; HSAKMT_STATUS err = HSAKMT_CALL(hsaKmtExportDMABufHandle(const_cast<void*>(ptr), size, &fd, &off)); @@ -3319,7 +3328,6 @@ hsa_status_t Runtime::VMemoryHandleUnmap(void* va, size_t size) { if (va_chunk != va_ptr + size) { return HSA_STATUS_ERROR_INVALID_ALLOCATION; } - hsa_status_t status; for (auto mappedHandleIt : mappedHandles) { // Remove access from all agents that were allowed access diff --git a/runtime/hsa-runtime/hsacore.so.def b/runtime/hsa-runtime/hsacore.so.def index d8a3e050a8..d2e785a48c 100644 --- a/runtime/hsa-runtime/hsacore.so.def +++ b/runtime/hsa-runtime/hsacore.so.def @@ -234,6 +234,7 @@ global: hsa_amd_spm_release; hsa_amd_spm_set_dest_buffer; hsa_amd_portable_export_dmabuf; + hsa_amd_portable_export_dmabuf_v2; hsa_amd_portable_close_dmabuf; 
hsa_amd_vmem_address_reserve; hsa_amd_vmem_address_reserve_align; diff --git a/runtime/hsa-runtime/inc/hsa_api_trace.h b/runtime/hsa-runtime/inc/hsa_api_trace.h index b1bfc2e848..6515b19700 100644 --- a/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -270,6 +270,7 @@ struct AmdExtTable { decltype(hsa_amd_enable_logging)* hsa_amd_enable_logging_fn; decltype(hsa_amd_signal_wait_all)* hsa_amd_signal_wait_all_fn; decltype(hsa_amd_memory_get_preferred_copy_engine)* hsa_amd_memory_get_preferred_copy_engine_fn; + decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn; }; // Table to export HSA Core Runtime Apis diff --git a/runtime/hsa-runtime/inc/hsa_api_trace_version.h b/runtime/hsa-runtime/inc/hsa_api_trace_version.h index 27ebd4c4ca..befd1e26e3 100644 --- a/runtime/hsa-runtime/inc/hsa_api_trace_version.h +++ b/runtime/hsa-runtime/inc/hsa_api_trace_version.h @@ -58,7 +58,7 @@ // Step Ids of the Api tables exported by Hsa Core Runtime #define HSA_API_TABLE_STEP_VERSION 0x01 #define HSA_CORE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x06 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x07 #define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 #define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01 // Rocprofiler just checks HSA_MAGE_EXT_API_TABLE_STEP_VERSION diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 30412be1f8..3f96f18aef 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -59,9 +59,10 @@ * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align * - 1.7 - hsa_amd_signal_wait_all * - 1.8 - hsa_amd_memory_get_preferred_copy_engine + * - 1.9 - hsa_amd_portable_export_dmabuf_v2 */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 8 +#define HSA_AMD_INTERFACE_VERSION_MINOR 9 #ifdef __cplusplus extern "C" { @@ -446,6 +447,11 @@ enum { * Resource is busy or 
temporarily unavailable */ HSA_STATUS_ERROR_RESOURCE_BUSY = 46, + + /** + * Request is not supported by this system + */ + HSA_STATUS_ERROR_NOT_SUPPORTED = 47, }; /** @} */ @@ -759,6 +765,17 @@ typedef enum hsa_amd_coherency_type_s { } hsa_amd_coherency_type_t; +/** + * @brief dmabuf attributes + */ +#ifdef __cplusplus +typedef enum hsa_amd_dma_buf_mapping_type_s : int { +#else +typedef enum hsa_amd_dma_buf_mapping_type_s { +#endif + HSA_AMD_DMABUF_MAPPING_TYPE_NONE = 0, + HSA_AMD_DMABUF_MAPPING_TYPE_PCIE = 1 +} hsa_amd_dma_buf_mapping_type_t; /** * @brief Get the coherency type of the fine grain region of an agent. * @@ -3138,21 +3155,10 @@ hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t siz */ /** - * @brief Obtains an OS specific, vendor neutral, handle to a memory allocation. + * @brief Older version of hsa_amd_portable_export_dmabuf_v2 * - * Obtains an OS specific handle to GPU agent memory. The memory must be part - * of a single allocation from an hsa_amd_memory_pool_t exposed by a GPU Agent. - * The handle may be used with other APIs (e.g. Vulkan) to obtain shared access - * to the allocation. - * - * Shared access to the memory is not guaranteed to be fine grain coherent even - * if the allocation exported is from a fine grain pool. The shared memory - * consistency model will be no stronger than the model exported from, consult - * the importing API to determine the final consistency model. - * - * The allocation's memory remains valid as long as the handle and any mapping - * of the handle remains valid. When the handle and all mappings are closed - * the backing memory will be released for reuse. + * This is the same as calling hsa_amd_portable_export_dmabuf_v2() with the + * flags argument set to HSA_AMD_DMABUF_MAPPING_TYPE_NONE. * * @param[in] ptr Pointer to the allocation being exported. 
* @@ -3185,6 +3191,56 @@ hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t siz hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf, uint64_t* offset); + /** + * @brief Obtains an OS specific, vendor neutral, handle to a memory allocation. + * + * Obtains an OS specific handle to GPU agent memory. The memory must be part + * of a single allocation from an hsa_amd_memory_pool_t exposed by a GPU Agent. + * The handle may be used with other APIs (e.g. Vulkan) to obtain shared access + * to the allocation. + * + * Shared access to the memory is not guaranteed to be fine grain coherent even + * if the allocation exported is from a fine grain pool. The shared memory + * consistency model will be no stronger than the model exported from, consult + * the importing API to determine the final consistency model. + * + * The allocation's memory remains valid as long as the handle and any mapping + * of the handle remains valid. When the handle and all mappings are closed + * the backing memory will be released for reuse. + * + * @param[in] ptr Pointer to the allocation being exported. + * + * @param[in] size Size in bytes to export following @p ptr. The entire range + * being exported must be contained within a single allocation. + * + * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the + * allocation. Contents will not be altered in the event of failure. + * + * @param[out] offset Offset in bytes into the memory referenced by the dma-buf + * object at which @p ptr resides. Contents will not be altered in the event + * of failure. + * + * @param[in] flags Bitmask of hsa_amd_dma_buf_mapping_type_t flags. + * + * @retval ::HSA_STATUS_SUCCESS Export completed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by + * @p ptr and @p size are not contained within a single allocation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr + * and @p size was allocated on a device which can not export memory. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor, + * @p dmabuf, could not be created. + */ +hsa_status_t hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size, + int* dmabuf, uint64_t* offset, uint64_t flags); + /** * @brief Closes an OS specific, vendor neutral, handle to a memory allocation. *