diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index f3e6f05419..894c933432 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -49,6 +49,10 @@ #include "hsakmt/hsakmt.h" +#include "core/inc/amd_cpu_agent.h" +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/amd_memory_region.h" +#include "core/inc/exceptions.h" #include "core/inc/runtime.h" namespace rocr { @@ -70,18 +74,155 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) { return HSA_STATUS_SUCCESS; } -hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id, - core::MemProperties &mprops) const { +hsa_status_t +KfdDriver::GetMemoryProperties(uint32_t node_id, + core::MemoryRegion &mem_region) const { return HSA_STATUS_SUCCESS; } -hsa_status_t KfdDriver::AllocateMemory(void **mem, size_t size, - uint32_t node_id, core::MemFlags flags) { - return HSA_STATUS_SUCCESS; +hsa_status_t +KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region, + core::MemoryRegion::AllocateFlags alloc_flags, + void **mem, size_t size, uint32_t agent_node_id) { + const MemoryRegion &m_region(static_cast(mem_region)); + HsaMemFlags kmt_alloc_flags(m_region.mem_flags()); + + kmt_alloc_flags.ui32.ExecuteAccess = + (alloc_flags & core::MemoryRegion::AllocateExecutable ? 1 : 0); + kmt_alloc_flags.ui32.AQLQueueMemory = + (alloc_flags & core::MemoryRegion::AllocateDoubleMap ? 1 : 0); + + if (m_region.IsSystem() && + (alloc_flags & core::MemoryRegion::AllocateNonPaged)) { + kmt_alloc_flags.ui32.NonPaged = 1; + } + + // Allocating a memory handle for virtual memory + kmt_alloc_flags.ui32.NoAddress = + !!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly); + + // Allocate pseudo fine grain memory + kmt_alloc_flags.ui32.CoarseGrain = + (alloc_flags & core::MemoryRegion::AllocatePCIeRW + ? 
0 + : kmt_alloc_flags.ui32.CoarseGrain); + + kmt_alloc_flags.ui32.NoSubstitute = + (alloc_flags & core::MemoryRegion::AllocatePinned + ? 1 + : kmt_alloc_flags.ui32.NoSubstitute); + + kmt_alloc_flags.ui32.GTTAccess = + (alloc_flags & core::MemoryRegion::AllocateGTTAccess + ? 1 + : kmt_alloc_flags.ui32.GTTAccess); + + if (m_region.IsLocalMemory()) { + // Allocate physically contiguous memory. AllocateKfdMemory function call + // will fail if this flag is not supported in KFD. + kmt_alloc_flags.ui32.Contiguous = + (alloc_flags & core::MemoryRegion::AllocateContiguous + ? 1 + : kmt_alloc_flags.ui32.Contiguous); + } + + //// Only allow using the suballocator for ordinary VRAM. + if (m_region.IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) { + bool subAllocEnabled = + !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc(); + // Avoid modifying executable or queue allocations. + bool useSubAlloc = subAllocEnabled; + useSubAlloc &= + ((alloc_flags & (~core::MemoryRegion::AllocateRestrict)) == 0); + + if (useSubAlloc) { + *mem = m_region.fragment_alloc(size); + + if ((alloc_flags & core::MemoryRegion::AllocateAsan) && + hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) { + m_region.fragment_free(*mem); + *mem = nullptr; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return HSA_STATUS_SUCCESS; + } + } + + const uint32_t node_id = + (alloc_flags & core::MemoryRegion::AllocateGTTAccess) + ? agent_node_id + : m_region.owner()->node_id(); + + //// Allocate memory. + //// If it fails attempt to release memory from the block allocator and retry. + *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size); + if (*mem == nullptr) { + m_region.owner()->Trim(); + *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size); + } + + if (*mem != nullptr) { + if (kmt_alloc_flags.ui32.NoAddress) + return HSA_STATUS_SUCCESS; + + // Commit the memory. + // For system memory, on non-restricted allocation, map it to all GPUs. 
On + // restricted allocation, only CPU is allowed to access by default, so + // no need to map + // For local memory, only map it to the owning GPU. Mapping to other GPU, + // if the access is allowed, is performed on AllowAccess. + HsaMemMapFlags map_flag = m_region.map_flags(); + size_t map_node_count = 1; + const uint32_t owner_node_id = m_region.owner()->node_id(); + const uint32_t *map_node_id = &owner_node_id; + + if (m_region.IsSystem()) { + if ((alloc_flags & core::MemoryRegion::AllocateRestrict) == 0) { + // Map to all GPU agents. + map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size(); + + if (map_node_count == 0) { + // No need to pin since no GPU in the platform. + return HSA_STATUS_SUCCESS; + } + + map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0]; + } else { + // No need to pin it for CPU exclusive access. + return HSA_STATUS_SUCCESS; + } + } + + uint64_t alternate_va = 0; + const bool is_resident = MakeKfdMemoryResident( + map_node_count, map_node_id, *mem, size, &alternate_va, map_flag); + + const bool require_pinning = + (!m_region.full_profile() || m_region.IsLocalMemory() || + m_region.IsScratch()); + + if (require_pinning && !is_resident) { + FreeKfdMemory(*mem, size); + *mem = nullptr; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + if ((alloc_flags & core::MemoryRegion::AllocateAsan) && + hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) { + FreeKfdMemory(*mem, size); + *mem = nullptr; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + return HSA_STATUS_SUCCESS; + } + + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } -hsa_status_t KfdDriver::FreeMemory(void *mem, uint32_t node_id) { - return HSA_STATUS_SUCCESS; +hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) { + MakeKfdMemoryUnresident(mem); + return FreeKfdMemory(mem, size) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) { @@ -92,5 +233,45 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const { return HSA_STATUS_SUCCESS; } +void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id, + size_t size) { + void *mem = nullptr; + const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flags, &mem); + return (status == HSAKMT_STATUS_SUCCESS) ? mem : nullptr; +} + +bool KfdDriver::FreeKfdMemory(void *mem, size_t size) { + if (mem == nullptr || size == 0) { + debug_print("Invalid free ptr:%p size:%lu\n", mem, size); + return true; + } + + if (hsaKmtFreeMemory(mem, size) != HSAKMT_STATUS_SUCCESS) { + debug_print("Failed to free ptr:%p size:%lu\n", mem, size); + return false; + } + return true; +} + +bool KfdDriver::MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes, + const void *mem, size_t size, + uint64_t *alternate_va, + HsaMemMapFlags map_flag) { + assert(num_node > 0); + assert(nodes); + + *alternate_va = 0; + + HSAKMT_STATUS kmt_status(hsaKmtMapMemoryToGPUNodes( + const_cast(mem), size, alternate_va, map_flag, num_node, + const_cast(nodes))); + + return (kmt_status == HSAKMT_STATUS_SUCCESS); +} + +void KfdDriver::MakeKfdMemoryUnresident(const void *mem) { + hsaKmtUnmapMemoryToGPU(const_cast(mem)); +} + } // namespace AMD } // namespace rocr diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 73788aa519..308ffe7aa9 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -47,6 +47,7 @@ #include #include +#include "core/inc/amd_memory_region.h" #include "core/inc/runtime.h" #include "uapi/amdxdna_accel.h" @@ -89,17 +90,18 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) { hsa_status_t XdnaDriver::GetMemoryProperties(uint32_t node_id, - core::MemProperties 
&mprops) const { + core::MemoryRegion &mem_region) const { return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::AllocateMemory(void **mem, size_t size, - uint32_t node_id, - core::MemFlags flags) { +hsa_status_t +XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, + core::MemoryRegion::AllocateFlags alloc_flags, + void **mem, size_t size, uint32_t node_id) { return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::FreeMemory(void *mem, uint32_t node_id) { +hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) { return HSA_STATUS_SUCCESS; } diff --git a/runtime/hsa-runtime/core/inc/agent.h b/runtime/hsa-runtime/core/inc/agent.h index f7396a8844..4d745e90e7 100644 --- a/runtime/hsa-runtime/core/inc/agent.h +++ b/runtime/hsa-runtime/core/inc/agent.h @@ -49,11 +49,12 @@ #include #include "core/inc/checked.h" +#include "core/inc/driver.h" #include "core/inc/isa.h" -#include "core/inc/queue.h" #include "core/inc/memory_region.h" -#include "core/util/utils.h" +#include "core/inc/queue.h" #include "core/util/locks.h" +#include "core/util/utils.h" namespace rocr { @@ -117,19 +118,18 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { // @brief Agent class contructor. // // @param [in] type CPU or GPU or other. - explicit Agent(uint32_t node_id, DeviceType type) - : node_id_(node_id), - device_type_(uint32_t(type)), - profiling_enabled_(false), - enabled_(false) { + explicit Agent(DriverType drv_type, uint32_t node_id, DeviceType type) + : driver_type(drv_type), node_id_(node_id), device_type_(uint32_t(type)), + profiling_enabled_(false), enabled_(false) { public_handle_ = Convert(this); } // @brief Agent class contructor. // // @param [in] type CPU or GPU or other. 
- explicit Agent(uint32_t node_id, uint32_t type) - : node_id_(node_id), device_type_(type), profiling_enabled_(false) { + explicit Agent(DriverType drv_type, uint32_t node_id, uint32_t type) + : driver_type(drv_type), node_id_(node_id), device_type_(type), + profiling_enabled_(false) { public_handle_ = Convert(this); } @@ -315,7 +315,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { for (auto region : regions()) region->Trim(); } - protected: + const DriverType driver_type; + +protected: // Intention here is to have a polymorphic update procedure for public_handle_ // which is callable on any Agent* but only from some class dervied from // Agent*. do_set_public_handle should remain protected or private in all diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index b105dbd6be..4b7c93ec20 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -51,15 +51,16 @@ #include "hsakmt/hsakmt.h" -#include "core/inc/runtime.h" #include "core/inc/agent.h" #include "core/inc/blit.h" -#include "core/inc/signal.h" #include "core/inc/cache.h" +#include "core/inc/driver.h" +#include "core/inc/runtime.h" #include "core/inc/scratch_cache.h" -#include "core/util/small_heap.h" -#include "core/util/locks.h" +#include "core/inc/signal.h" #include "core/util/lazy_ptr.h" +#include "core/util/locks.h" +#include "core/util/small_heap.h" #include "pcs/pcs_runtime.h" namespace rocr { @@ -72,142 +73,154 @@ typedef ScratchCache::ScratchInfo ScratchInfo; class GpuAgentInt : public core::Agent { public: // @brief Constructor - GpuAgentInt(uint32_t node_id) - : core::Agent(node_id,core::Agent::DeviceType::kAmdGpuDevice) {} + GpuAgentInt(uint32_t node_id) + : core::Agent(core::DriverType::KFD, node_id, + core::Agent::DeviceType::kAmdGpuDevice) {} - // @brief Ensure blits are ready (performance hint). 
- virtual void PreloadBlits() {} + // @brief Ensure blits are ready (performance hint). + virtual void PreloadBlits() {} - // @brief Initialization hook invoked after tools library has loaded, - // to allow tools interception of interface functions. - // - // @retval HSA_STATUS_SUCCESS if initialization is successful. - virtual hsa_status_t PostToolsInit() = 0; + // @brief Initialization hook invoked after tools library has loaded, + // to allow tools interception of interface functions. + // + // @retval HSA_STATUS_SUCCESS if initialization is successful. + virtual hsa_status_t PostToolsInit() = 0; - // @brief Invoke the user provided callback for each region accessible by - // this agent. - // - // @param [in] include_peer If true, the callback will be also invoked on each - // peer memory region accessible by this agent. If false, only invoke the - // callback on memory region owned by this agent. - // @param [in] callback User provided callback function. - // @param [in] data User provided pointer as input for @p callback. - // - // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed - // region returns ::HSA_STATUS_SUCCESS. - virtual hsa_status_t VisitRegion(bool include_peer, - hsa_status_t (*callback)(hsa_region_t region, - void* data), - void* data) const = 0; + // @brief Invoke the user provided callback for each region accessible by + // this agent. + // + // @param [in] include_peer If true, the callback will be also invoked on + // each peer memory region accessible by this agent. If false, only invoke + // the callback on memory region owned by this agent. + // @param [in] callback User provided callback function. + // @param [in] data User provided pointer as input for @p callback. + // + // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed + // region returns ::HSA_STATUS_SUCCESS. 
+ virtual hsa_status_t + VisitRegion(bool include_peer, + hsa_status_t (*callback)(hsa_region_t region, void *data), + void *data) const = 0; - // @brief Carve scratch memory for main from scratch pool. - // - // @param [in/out] scratch Structure to be populated with the carved memory - // information. - virtual void AcquireQueueMainScratch(ScratchInfo& scratch) = 0; + // @brief Carve scratch memory for main from scratch pool. + // + // @param [in/out] scratch Structure to be populated with the carved memory + // information. + virtual void AcquireQueueMainScratch(ScratchInfo &scratch) = 0; - // @brief Carve scratch memory for alt from scratch pool. - // - // @param [in/out] scratch Structure to be populated with the carved memory - // information. - virtual void AcquireQueueAltScratch(ScratchInfo& scratch) = 0; + // @brief Carve scratch memory for alt from scratch pool. + // + // @param [in/out] scratch Structure to be populated with the carved memory + // information. + virtual void AcquireQueueAltScratch(ScratchInfo &scratch) = 0; - // @brief Release scratch memory from main back to scratch pool. - // - // @param [in/out] scratch Scratch memory previously acquired with call to - // ::AcquireQueueMainScratch. - virtual void ReleaseQueueMainScratch(ScratchInfo& base) = 0; + // @brief Release scratch memory from main back to scratch pool. + // + // @param [in/out] scratch Scratch memory previously acquired with call to + // ::AcquireQueueMainScratch. + virtual void ReleaseQueueMainScratch(ScratchInfo &base) = 0; - // @brief Release scratch memory back from alternate to scratch pool. - // - // @param [in/out] scratch Scratch memory previously acquired with call to - // ::AcquireQueueAltcratch. - virtual void ReleaseQueueAltScratch(ScratchInfo& base) = 0; + // @brief Release scratch memory back from alternate to scratch pool. + // + // @param [in/out] scratch Scratch memory previously acquired with call to + // ::AcquireQueueAltScratch.
+ virtual void ReleaseQueueAltScratch(ScratchInfo &base) = 0; - // @brief Translate the kernel start and end dispatch timestamp from agent - // domain to host domain. - // - // @param [in] signal Pointer to signal that provides the dispatch timing. - // @param [out] time Structure to be populated with the host domain value. - virtual void TranslateTime(core::Signal* signal, - hsa_amd_profiling_dispatch_time_t& time) = 0; + // @brief Translate the kernel start and end dispatch timestamp from agent + // domain to host domain. + // + // @param [in] signal Pointer to signal that provides the dispatch timing. + // @param [out] time Structure to be populated with the host domain value. + virtual void TranslateTime(core::Signal *signal, + hsa_amd_profiling_dispatch_time_t &time) = 0; - // @brief Translate the async copy start and end timestamp from agent - // domain to host domain. - // - // @param [in] signal Pointer to signal that provides the async copy timing. - // @param [out] time Structure to be populated with the host domain value. - virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) = 0; + // @brief Translate the async copy start and end timestamp from agent + // domain to host domain. + // + // @param [in] signal Pointer to signal that provides the async copy timing. + // @param [out] time Structure to be populated with the host domain value. + virtual void TranslateTime(core::Signal *signal, + hsa_amd_profiling_async_copy_time_t &time) = 0; - // @brief Translate timestamp agent domain to host domain. - // - // @param [out] time Timestamp in agent domain. - virtual uint64_t TranslateTime(uint64_t tick) = 0; + // @brief Translate timestamp agent domain to host domain. + // + // @param [out] time Timestamp in agent domain. + virtual uint64_t TranslateTime(uint64_t tick) = 0; - // @brief Invalidate caches on the agent which may hold code object data. 
- virtual void InvalidateCodeCaches() = 0; + // @brief Invalidate caches on the agent which may hold code object data. + virtual void InvalidateCodeCaches() = 0; - // @brief Sets the coherency type of this agent. - // - // @param [in] type New coherency type. - // - // @retval true The new coherency type is set successfuly. - virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0; + // @brief Sets the coherency type of this agent. + // + // @param [in] type New coherency type. + // + // @retval true The new coherency type is set successfully. + virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0; - // @brief Returns the current coherency type of this agent. - // - // @retval Coherency type. - virtual hsa_amd_coherency_type_t current_coherency_type() const = 0; + // @brief Returns the current coherency type of this agent. + // + // @retval Coherency type. + virtual hsa_amd_coherency_type_t current_coherency_type() const = 0; - virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0; + virtual void RegisterGangPeer(core::Agent &gang_peer, + unsigned int bandwidth_factor) = 0; - virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0; + virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent &gang_peer, + uint32_t rec_sdma_eng_id_mask) = 0; - // @brief Query if agent represent Kaveri GPU. - // - // @retval true if agent is Kaveri GPU. - virtual bool is_kv_device() const = 0; + // @brief Query if agent represents Kaveri GPU. + // + // @retval true if agent is Kaveri GPU. + virtual bool is_kv_device() const = 0; - // @brief Query the agent HSA profile. - // - // @retval HSA profile. - virtual hsa_profile_t profile() const = 0; + // @brief Query the agent HSA profile. + // + // @retval HSA profile. + virtual hsa_profile_t profile() const = 0; - // @brief Query the agent memory bus width in bit. - // - // @retval Bus width in bit.
- virtual uint32_t memory_bus_width() const = 0; + // @brief Query the agent memory bus width in bit. + // + // @retval Bus width in bit. + virtual uint32_t memory_bus_width() const = 0; - // @brief Query the agent memory maximum frequency in MHz. - // - // @retval Bus width in MHz. - virtual uint32_t memory_max_frequency() const = 0; + // @brief Query the agent memory maximum frequency in MHz. + // + // @retval Maximum memory frequency in MHz. + virtual uint32_t memory_max_frequency() const = 0; - // @brief Whether agent supports asynchronous scratch reclaim. Depends on CP FW - virtual bool AsyncScratchReclaimEnabled() const = 0; + // @brief Whether agent supports asynchronous scratch reclaim. Depends on CP + // FW + virtual bool AsyncScratchReclaimEnabled() const = 0; - // @brief Update the agent's scratch use-once threshold. - // Only valid when async scratch reclaim is supported - // @retval HSA_STATUS_SUCCESS if successful - virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0; + // @brief Update the agent's scratch use-once threshold.
+ // Only valid when async scratch reclaim is supported + // @retval HSA_STATUS_SUCCESS if successful + virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0; - // @brief Iterate through supported PC Sampling configurations - // @retval HSA_STATUS_SUCCESS if successful - virtual hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, - void* cb_data) = 0; + // @brief Iterate through supported PC Sampling configurations + // @retval HSA_STATUS_SUCCESS if successful + virtual hsa_status_t + PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, + void *cb_data) = 0; - virtual hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) = 0; + virtual hsa_status_t + PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) = 0; - virtual hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, - pcs::PcsRuntime::PcSamplingSession& session) = 0; + virtual hsa_status_t + PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, + pcs::PcsRuntime::PcSamplingSession &session) = 0; - virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0; + virtual hsa_status_t + PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) = 0; - virtual hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) = 0; + virtual hsa_status_t + PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) = 0; - virtual hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) = 0; + virtual hsa_status_t + PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) = 0; - virtual hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) = 0; + virtual hsa_status_t + PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) = 0; }; class GpuAgent : public GpuAgentInt { diff --git a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index 40fdf91779..1939c0511b 100644 --- 
a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -43,11 +43,21 @@ #ifndef HSA_RUNTIME_CORE_INC_AMD_KFD_DRIVER_H_ #define HSA_RUNTIME_CORE_INC_AMD_KFD_DRIVER_H_ -#include "core/inc/driver.h" - #include +#include "hsakmt/hsakmt.h" + +#include "core/inc/driver.h" +#include "core/inc/memory_region.h" + namespace rocr { + +namespace core { + +class Queue; + +} + namespace AMD { class KfdDriver : public core::Driver { @@ -57,13 +67,33 @@ public: static hsa_status_t DiscoverDriver(); hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; - hsa_status_t GetMemoryProperties(uint32_t node_id, - core::MemProperties &mprops) const override; - hsa_status_t AllocateMemory(void **mem, size_t size, uint32_t node_id, - core::MemFlags flags) override; - hsa_status_t FreeMemory(void *mem, uint32_t node_id) override; + hsa_status_t + GetMemoryProperties(uint32_t node_id, + core::MemoryRegion &mem_region) const override; + hsa_status_t AllocateMemory(const core::MemoryRegion &mem_region, + core::MemoryRegion::AllocateFlags alloc_flags, + void **mem, size_t size, + uint32_t node_id) override; + hsa_status_t FreeMemory(void *mem, size_t size) override; hsa_status_t CreateQueue(core::Queue &queue) override; hsa_status_t DestroyQueue(core::Queue &queue) const override; + +private: + /// @brief Allocate agent accessible memory (system / local memory). + static void *AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id, + size_t size); + + /// @brief Free agent accessible memory (system / local memory). + static bool FreeKfdMemory(void *mem, size_t size); + + /// @brief Pin memory. + static bool MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes, + const void *mem, size_t size, + uint64_t *alternate_va, + HsaMemMapFlags map_flag); + + /// @brief Unpin memory. 
+ static void MakeKfdMemoryUnresident(const void *mem); }; } // namespace AMD diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h index 49fdac348c..b052d5c386 100644 --- a/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -77,13 +77,6 @@ class MemoryRegion : public core::MemoryRegion { return reinterpret_cast(region.handle); } - /// @brief Allocate agent accessible memory (system / local memory). - static void* AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, - size_t size); - - /// @brief Free agent accessible memory (system / local memory). - static bool FreeKfdMemory(void* ptr, size_t size); - static bool RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags); static void DeregisterMemory(void* ptr); @@ -175,7 +168,15 @@ class MemoryRegion : public core::MemoryRegion { __forceinline size_t GetPageSize() const { return kPageSize_; } - private: + __forceinline const HsaMemFlags &mem_flags() const { return mem_flag_; } + __forceinline const HsaMemMapFlags &map_flags() const { return map_flag_; } + + void *fragment_alloc(size_t size) const { + return fragment_allocator_.alloc(size); + } + bool fragment_free(void *mem) const { return fragment_allocator_.free(mem); } + +private: const HsaMemoryProperties mem_props_; HsaMemFlags mem_flag_; diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 5fe7eae315..c45b33b11a 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -45,8 +45,13 @@ #include #include "core/inc/driver.h" +#include "core/inc/memory_region.h" namespace rocr { +namespace core { +class Queue; +} + namespace AMD { class XdnaDriver : public core::Driver { @@ -57,11 +62,14 @@ public: static hsa_status_t DiscoverDriver(); hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; - 
hsa_status_t GetMemoryProperties(uint32_t node_id, - core::MemProperties &mprops) const override; - hsa_status_t AllocateMemory(void **mem, size_t size, uint32_t node_id, - core::MemFlags flags) override; - hsa_status_t FreeMemory(void *mem, uint32_t node_id) override; + hsa_status_t + GetMemoryProperties(uint32_t node_id, + core::MemoryRegion &mem_region) const override; + hsa_status_t AllocateMemory(const core::MemoryRegion &mem_region, + core::MemoryRegion::AllocateFlags alloc_flags, + void **mem, size_t size, + uint32_t node_id) override; + hsa_status_t FreeMemory(void *mem, size_t size) override; hsa_status_t CreateQueue(core::Queue &queue) override; hsa_status_t DestroyQueue(core::Queue &queue) const override; diff --git a/runtime/hsa-runtime/core/inc/driver.h b/runtime/hsa-runtime/core/inc/driver.h index 22f5f3d0a1..8c22a39b01 100644 --- a/runtime/hsa-runtime/core/inc/driver.h +++ b/runtime/hsa-runtime/core/inc/driver.h @@ -46,20 +46,13 @@ #include #include -#include "core/inc/agent.h" #include "core/inc/memory_region.h" #include "inc/hsa.h" namespace rocr { namespace core { -using MemFlags = uint32_t; - -struct MemProperties { - MemFlags flags_; - size_t size_bytes_; - uint64_t virtual_base_addr_; -}; +class Queue; struct DriverVersionInfo { uint32_t major; @@ -85,17 +78,27 @@ class Driver { /// @retval HSA_STATUS_SUCCESS if the kernel-model driver query was /// successful. virtual hsa_status_t QueryKernelModeDriver(DriverQuery query) = 0; + /// @brief Open a connection to the driver using name_. /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully. hsa_status_t Open(); + /// @brief Close a connection to the open driver using fd_. /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully. hsa_status_t Close(); + /// @brief Get driver version information. /// @retval DriverVersionInfo containing the driver's version information. 
- DriverVersionInfo Version() const { return version_; } + const DriverVersionInfo &Version() const { return version_; } - virtual hsa_status_t GetMemoryProperties(uint32_t node_id, MemProperties &mprops) const = 0; + /// @brief Get the memory properties of a specific node. + /// @param node_id Node ID of the agent + /// @param[in, out] mem_region MemoryRegion object whose properties will be + /// retrieved. + /// @retval HSA_STATUS_SUCCESS if the driver successfully returns the node's + /// memory properties. + virtual hsa_status_t GetMemoryProperties(uint32_t node_id, + MemoryRegion &mem_region) const = 0; /// @brief Allocate agent-accessible memory (system or agent-local memory). /// @@ -103,10 +106,12 @@ class Driver { /// /// @retval HSA_STATUS_SUCCESS if memory was successfully allocated or /// hsa_status_t error code if the memory allocation failed. - virtual hsa_status_t AllocateMemory(void** mem, size_t size, uint32_t node_id, - MemFlags flags) = 0; + virtual hsa_status_t AllocateMemory(const MemoryRegion &mem_region, + MemoryRegion::AllocateFlags alloc_flags, + void **mem, size_t size, + uint32_t node_id) = 0; - virtual hsa_status_t FreeMemory(void* mem, uint32_t node_id) = 0; + virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0; virtual hsa_status_t CreateQueue(Queue &queue) = 0; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index 6e92211a7e..b98de4da25 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -43,12 +43,14 @@ #include "core/inc/amd_aie_agent.h" #include "core/inc/amd_aie_aql_queue.h" +#include "core/inc/driver.h" namespace rocr { namespace AMD { AieAgent::AieAgent(uint32_t node) - : core::Agent(node, core::Agent::DeviceType::kAmdAieDevice), + : core::Agent(core::DriverType::XDNA, node, + core::Agent::DeviceType::kAmdAieDevice), max_queues_(core::Runtime::runtime_singleton_->flag().max_queues())
{ InitRegionList(); } diff --git a/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp index df473d4219..a52df4c58a 100644 --- a/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp @@ -47,14 +47,16 @@ #include #include "core/inc/amd_memory_region.h" +#include "core/inc/driver.h" #include "core/inc/host_queue.h" #include "inc/hsa_ext_image.h" namespace rocr { namespace AMD { -CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) - : core::Agent(node, kAmdCpuDevice), properties_(node_props) { +CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties &node_props) + : core::Agent(core::DriverType::KFD, node, kAmdCpuDevice), + properties_(node_props) { InitRegionList(); InitCacheList(); diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index d24d6d717a..d54ff4b4f5 100644 --- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -59,25 +59,6 @@ namespace AMD { size_t MemoryRegion::max_sysmem_alloc_size_ = 0; size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE); -void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) { - void* ret = NULL; - const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret); - return (status == HSAKMT_STATUS_SUCCESS) ? 
ret : NULL; -} - -bool MemoryRegion::FreeKfdMemory(void* ptr, size_t size) { - if (ptr == NULL || size == 0) { - debug_print("Invalid free ptr:%p size:%lu\n", ptr, size); - return true; - } - - if (hsaKmtFreeMemory(ptr, size) != HSAKMT_STATUS_SUCCESS) { - debug_print("Failed to free ptr:%p size:%lu\n", ptr, size); - return false; - } - return true; -} - bool MemoryRegion::RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags) { assert(ptr != NULL); assert(size != 0); @@ -198,112 +179,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, size = AlignUp(size, kPageSize_); - HsaMemFlags kmt_alloc_flags(mem_flag_); - kmt_alloc_flags.ui32.ExecuteAccess = - (alloc_flags & AllocateExecutable ? 1 : 0); - kmt_alloc_flags.ui32.AQLQueueMemory = - (alloc_flags & AllocateDoubleMap ? 1 : 0); - if (IsSystem() && (alloc_flags & AllocateNonPaged)) - kmt_alloc_flags.ui32.NonPaged = 1; - - // Allocating a memory handle for virtual memory - kmt_alloc_flags.ui32.NoAddress = !!(alloc_flags & AllocateMemoryOnly); - - // Allocate pseudo fine grain memory - kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain); - kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute); - - kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess); - if (IsLocalMemory()) { - // Allocate physically contiguous memory - AllocateKfdMemory function call will fail - // if this flag is not supported in KFD. - kmt_alloc_flags.ui32.Contiguous = - (alloc_flags & AllocateContiguous ? 1 : kmt_alloc_flags.ui32.Contiguous); - } - - // Only allow using the suballocator for ordinary VRAM. - if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) { - bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc(); - // Avoid modifying executable or queue allocations. 
- bool useSubAlloc = subAllocEnabled; - useSubAlloc &= ((alloc_flags & (~AllocateRestrict)) == 0); - if (useSubAlloc) { - *address = fragment_allocator_.alloc(size); - - if ((alloc_flags & AllocateAsan) && - hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) { - fragment_allocator_.free(*address); - *address = NULL; - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - return HSA_STATUS_SUCCESS; - } - } - - const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id(); - - // Allocate memory. - // If it fails attempt to release memory from the block allocator and retry. - *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size); - if (*address == nullptr) { - owner()->Trim(); - *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size); - } - - if (*address != nullptr) { - if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS; - - // Commit the memory. - // For system memory, on non-restricted allocation, map it to all GPUs. On - // restricted allocation, only CPU is allowed to access by default, so - // no need to map - // For local memory, only map it to the owning GPU. Mapping to other GPU, - // if the access is allowed, is performed on AllowAccess. - HsaMemMapFlags map_flag = map_flag_; - size_t map_node_count = 1; - const uint32_t owner_node_id = owner()->node_id(); - const uint32_t* map_node_id = &owner_node_id; - - if (IsSystem()) { - if ((alloc_flags & AllocateRestrict) == 0) { - // Map to all GPU agents. - map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size(); - - if (map_node_count == 0) { - // No need to pin since no GPU in the platform. - return HSA_STATUS_SUCCESS; - } - - map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0]; - } else { - // No need to pin it for CPU exclusive access. 
- return HSA_STATUS_SUCCESS; - } - } - - uint64_t alternate_va = 0; - const bool is_resident = MakeKfdMemoryResident( - map_node_count, map_node_id, *address, size, &alternate_va, map_flag); - - const bool require_pinning = - (!full_profile() || IsLocalMemory() || IsScratch()); - - if (require_pinning && !is_resident) { - FreeKfdMemory(*address, size); - *address = NULL; - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - if ((alloc_flags & AllocateAsan) && - hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) { - FreeKfdMemory(*address, size); - *address = NULL; - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - return HSA_STATUS_SUCCESS; - } - - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + return core::Runtime::runtime_singleton_->AgentDriver(owner()->driver_type) + .AllocateMemory(*this, alloc_flags, address, size, agent_node_id); } hsa_status_t MemoryRegion::Free(void* address, size_t size) const { @@ -314,9 +191,8 @@ hsa_status_t MemoryRegion::Free(void* address, size_t size) const { hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const { if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS; - MakeKfdMemoryUnresident(address); - - return FreeKfdMemory(address, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + return core::Runtime::runtime_singleton_->AgentDriver(owner()->driver_type) + .FreeMemory(address, size); } // TODO: Look into a better name and/or making this process transparent to exporting.