From 59e91f0be8386dcbd2fe50d97edae74892d7a111 Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Tue, 26 Mar 2019 02:06:21 -0500 Subject: [PATCH] Add hsa_amd_memory_lock_to_pool. Makes malloc memory accessible to GPUs so that the memory has the capabilities of the pool it is locked to. This admits fine grained locked memory and reserves API space for any future special CPU pools. Change-Id: If8c3dd8582a43f19d3d36b3763c1a688cc419ef0 [ROCm/ROCR-Runtime commit: a535e18cc11843662279fc0f69957050375d90fb] --- .../core/common/hsa_table_interface.cpp | 8 +++ .../hsa-runtime/core/inc/amd_memory_region.h | 3 +- .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 5 ++ .../core/runtime/amd_memory_region.cpp | 16 +---- .../core/runtime/hsa_api_trace.cpp | 1 + .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 42 +++++++++-- .../runtime/hsa-runtime/hsacore.so.def | 3 +- .../runtime/hsa-runtime/inc/hsa_api_trace.h | 1 + .../runtime/hsa-runtime/inc/hsa_ext_amd.h | 71 ++++++++++++++++--- 9 files changed, 120 insertions(+), 30 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index 7fcd1d1723..bf501cd39f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -1015,6 +1015,14 @@ hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, host_ptr, size, agents, num_agent, agent_ptr); } +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents, + int num_agent, hsa_amd_memory_pool_t pool, + uint32_t flags, void** agent_ptr) { + return amdExtTable->hsa_amd_memory_lock_to_pool_fn(host_ptr, size, agents, num_agent, pool, flags, + agent_ptr); +} + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) { return 
amdExtTable->hsa_amd_memory_unlock_fn(host_ptr); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h index 8a1ca3ea7c..e96a21c809 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -82,8 +82,7 @@ class MemoryRegion : public core::MemoryRegion { /// @brief Free agent accessible memory (system / local memory). static void FreeKfdMemory(void* ptr, size_t size); - static bool RegisterMemory(void* ptr, size_t size, size_t num_nodes, - const uint32_t* nodes); + static bool RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags); static void DeregisterMemory(void* ptr); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index fcbd2eff33..17d927d9a1 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -173,6 +173,11 @@ hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, hsa_agent_t* agents, int num_agent, void** agent_ptr); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents, + int num_agent, hsa_amd_memory_pool_t pool, + uint32_t flags, void** agent_ptr); + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index d2bb4e9b61..e0bc2be05c 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -68,15 +68,11 @@ void 
MemoryRegion::FreeKfdMemory(void* ptr, size_t size) { assert(status == HSAKMT_STATUS_SUCCESS); } -bool MemoryRegion::RegisterMemory(void* ptr, size_t size, size_t num_nodes, - const uint32_t* nodes) { +bool MemoryRegion::RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags) { assert(ptr != NULL); assert(size != 0); - assert(num_nodes != 0); - assert(nodes != NULL); - const HSAKMT_STATUS status = hsaKmtRegisterMemoryToNodes( - ptr, size, num_nodes, const_cast(nodes)); + const HSAKMT_STATUS status = hsaKmtRegisterMemoryWithFlags(ptr, size, MemFlags); return (status == HSAKMT_STATUS_SUCCESS); } @@ -120,8 +116,6 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) ? 0 : 1; mem_flag_.ui32.NonPaged = 1; - map_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; - virtual_size_ = kGpuVmSize; } else if (IsSystem()) { mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; @@ -129,9 +123,6 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; - map_flag_.ui32.HostAccess = 1; - map_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; - virtual_size_ = (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } @@ -584,8 +575,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, } // Call kernel driver to register and pin the memory. 
- if (RegisterMemory(host_ptr, size, whitelist_nodes.size(), - &whitelist_nodes[0])) { + if (RegisterMemory(host_ptr, size, mem_flag_)) { uint64_t alternate_va = 0; if (MakeKfdMemoryResident(whitelist_nodes.size(), &whitelist_nodes[0], host_ptr, size, &alternate_va, map_flag_)) { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index 6e18860ebb..b7a71d4534 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -386,6 +386,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_queue_set_priority_fn = AMD::hsa_amd_queue_set_priority; amd_ext_api.hsa_amd_memory_async_copy_rect_fn = AMD::hsa_amd_memory_async_copy_rect; amd_ext_api.hsa_amd_runtime_queue_create_register_fn = AMD::hsa_amd_runtime_queue_create_register; + amd_ext_api.hsa_amd_memory_lock_to_pool_fn = AMD::hsa_amd_memory_lock_to_pool; } class Init { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index 3e690ff35a..b4fd546431 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -549,14 +549,48 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size, return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - const amd::MemoryRegion* system_region = - reinterpret_cast( - core::Runtime::runtime_singleton_->system_regions_fine()[0]); + // Check for APU + if (core::Runtime::runtime_singleton_->system_regions_coarse().size() == 0) { + assert(core::Runtime::runtime_singleton_->system_regions_fine()[0]->full_profile() && + "Missing coarse grain host memory on dGPU system."); + *agent_ptr = host_ptr; + return HSA_STATUS_SUCCESS; + } + + const amd::MemoryRegion* system_region = 
static_cast( + core::Runtime::runtime_singleton_->system_regions_coarse()[0]); return system_region->Lock(num_agent, agents, host_ptr, size, agent_ptr); CATCH; } +hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents, + int num_agent, hsa_amd_memory_pool_t pool, uint32_t flags, + void** agent_ptr) { + TRY; + IS_OPEN(); + *agent_ptr = NULL; + + if (size == 0 || host_ptr == NULL || agent_ptr == NULL || flags != 0) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if ((agents != NULL && num_agent == 0) || (agents == NULL && num_agent != 0)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_region_t region = {pool.handle}; + const amd::MemoryRegion* mem_region = amd::MemoryRegion::Convert(region); + if (mem_region == nullptr) { + return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL; + } + if (mem_region->owner()->device_type() != core::Agent::kAmdCpuDevice) + return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL; + + return mem_region->Lock(num_agent, agents, host_ptr, size, agent_ptr); + CATCH; +} + hsa_status_t hsa_amd_memory_unlock(void* host_ptr) { TRY; IS_OPEN(); @@ -615,7 +649,7 @@ hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, siz TRY; IS_OPEN(); - if (size == 0 || ptr == NULL) { + if (size == 0 || ptr == NULL || flags != 0) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def index cce468b985..b04afb21fe 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def +++ b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def @@ -181,7 +181,9 @@ global: hsa_amd_queue_cu_set_mask; hsa_amd_memory_fill; hsa_amd_memory_async_copy; + hsa_amd_memory_async_copy_rect; hsa_amd_memory_lock; + hsa_amd_memory_lock_to_pool; hsa_amd_memory_unlock; hsa_amd_agent_iterate_memory_pools; hsa_amd_agent_memory_pool_get_info; @@ -216,7 +218,6 @@ global: 
hsa_amd_ipc_signal_attach; hsa_amd_register_system_event_handler; hsa_amd_queue_set_priority; - hsa_amd_memory_async_copy_rect; local: *; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h index a9971b2299..2bdf229312 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -179,6 +179,7 @@ struct AmdExtTable { decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn; decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; decltype(hsa_amd_runtime_queue_create_register)* hsa_amd_runtime_queue_create_register_fn; + decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn; }; // Table to export HSA Core Runtime Apis diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h index ccd3f82045..646ccbead2 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -778,7 +778,7 @@ hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool. * * @param[in] flags A bit-field that is used to specify allocation - * directives. Must be 0. + * directives. Reserved parameter, must be 0. * * @param[out] ptr Pointer to the location where to store the base virtual * address of @@ -799,7 +799,8 @@ hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( * allocate memory in @p memory_pool, or @p size is greater than the value of * HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool. * - * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0. + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0, + * or flags is not 0. 
* */ hsa_status_t HSA_API @@ -1204,11 +1205,12 @@ hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, /** * - * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and return a new - * pointer accessible by the @p agents. If the @p host_ptr overlaps with previously locked - * memory, then the overlap area is kept locked (i.e multiple mappings are permitted). In this case, - * the same input @p host_ptr may give different locked @p agent_ptr and when it does, they - * are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted). In + * this case, the same input @p host_ptr may give different locked @p agent_ptr and when it does, + * they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Accesses to @p agent_ptr are coarse grained. * * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. * @@ -1235,20 +1237,69 @@ hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents * is NULL but @p num_agent is not 0. */ - hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, hsa_agent_t* agents, int num_agent, void** agent_ptr); /** * - * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock. + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted). 
+ * In this case, the same input @p host_ptr may give different locked @p agent_ptr and when it + * does, they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Accesses to the memory via @p agent_ptr have the same access properties as memory allocated from + * @p pool as determined by ::hsa_amd_memory_pool_get_info and ::hsa_amd_agent_memory_pool_get_info + * (ex. coarse/fine grain, platform atomic support, link info). Physical composition and placement + * of the memory (ex. page size, NUMA binding) is not changed. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handles to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[in] pool Global memory pool owned by a CPU agent. + * + * @param[in] flags A bit-field that is used to specify allocation + * directives. Reserved parameter, must be 0. + * + * @param[out] agent_ptr Pointer to the location where to store the new address. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agents in @p agents is + * invalid or cannot access @p pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL @p pool is invalid or not owned + * by a CPU agent. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0 or @p flags is not 0. 
+ */ +hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents, + int num_agent, hsa_amd_memory_pool_t pool, + uint32_t flags, void** agent_ptr); + +/** + * + * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock or + * ::hsa_amd_memory_lock_to_pool. * * @details The behavior is undefined if the host pointer being unpinned does not * match previous pinned address or if the host pointer was already deallocated. * * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was - * pinned previously via ::hsa_amd_memory_lock. + * pinned previously via ::hsa_amd_memory_lock or ::hsa_amd_memory_lock_to_pool. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. *