rocr: Generalize AMD::MemoryRegion Allocate and Free

Remove KFD-specific Allocate/Free calls from the AMD::MemoryRegion. The KFD-driver-specific Allocate/Free calls are now implemented in the KfdDriver. Future changes will migrate the remaining KFD-specific calls out of AMD::MemoryRegion. This allows the MemoryRegion to be used across AMD drivers like the XDNA driver. Change-Id: Ib6a2a9e5e1a15e61644d2592beb3a8e6578c3010
2024-08-19 15:46:36 +00:00
@@ -49,6 +49,10 @@

 #include "hsakmt/hsakmt.h"

+#include "core/inc/amd_cpu_agent.h"
+#include "core/inc/amd_gpu_agent.h"
+#include "core/inc/amd_memory_region.h"
+#include "core/inc/exceptions.h"
 #include "core/inc/runtime.h"

 namespace rocr {
@@ -70,18 +74,155 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id,
-                                            core::MemProperties &mprops) const {
+hsa_status_t
+KfdDriver::GetMemoryProperties(uint32_t node_id,
+                               core::MemoryRegion &mem_region) const {
  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t KfdDriver::AllocateMemory(void **mem, size_t size,
-                                       uint32_t node_id, core::MemFlags flags) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t
+KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                          core::MemoryRegion::AllocateFlags alloc_flags,
+                          void **mem, size_t size, uint32_t agent_node_id) {
+  const MemoryRegion &m_region(static_cast<const MemoryRegion &>(mem_region));
+  HsaMemFlags kmt_alloc_flags(m_region.mem_flags());
+
+  kmt_alloc_flags.ui32.ExecuteAccess =
+      (alloc_flags & core::MemoryRegion::AllocateExecutable ? 1 : 0);
+  kmt_alloc_flags.ui32.AQLQueueMemory =
+      (alloc_flags & core::MemoryRegion::AllocateDoubleMap ? 1 : 0);
+
+  if (m_region.IsSystem() &&
+      (alloc_flags & core::MemoryRegion::AllocateNonPaged)) {
+    kmt_alloc_flags.ui32.NonPaged = 1;
+  }
+
+  // Request a handle-only allocation (no address) for virtual memory.
+  kmt_alloc_flags.ui32.NoAddress =
+      !!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);
+
+  // AllocatePCIeRW clears CoarseGrain to allocate pseudo fine-grain memory.
+  kmt_alloc_flags.ui32.CoarseGrain =
+      (alloc_flags & core::MemoryRegion::AllocatePCIeRW
+           ? 0
+           : kmt_alloc_flags.ui32.CoarseGrain);
+
+  kmt_alloc_flags.ui32.NoSubstitute =
+      (alloc_flags & core::MemoryRegion::AllocatePinned
+           ? 1
+           : kmt_alloc_flags.ui32.NoSubstitute);
+
+  kmt_alloc_flags.ui32.GTTAccess =
+      (alloc_flags & core::MemoryRegion::AllocateGTTAccess
+           ? 1
+           : kmt_alloc_flags.ui32.GTTAccess);
+
+  if (m_region.IsLocalMemory()) {
+    // Allocate physically contiguous memory. AllocateKfdMemory function call
+    // will fail if this flag is not supported in KFD.
+    kmt_alloc_flags.ui32.Contiguous =
+        (alloc_flags & core::MemoryRegion::AllocateContiguous
+             ? 1
+             : kmt_alloc_flags.ui32.Contiguous);
+  }
+
+  // Only allow using the suballocator for ordinary VRAM.
+  if (m_region.IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
+    bool subAllocEnabled =
+        !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
+    // Sub-allocate only when no flag other than AllocateRestrict is set.
+    bool useSubAlloc = subAllocEnabled;
+    useSubAlloc &=
+        ((alloc_flags & (~core::MemoryRegion::AllocateRestrict)) == 0);
+
+    if (useSubAlloc) {
+      *mem = m_region.fragment_alloc(size);
+
+      if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
+          hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
+        m_region.fragment_free(*mem);
+        *mem = nullptr;
+        return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+      }
+
+      return HSA_STATUS_SUCCESS;
+    }
+  }
+
+  const uint32_t node_id =
+      (alloc_flags & core::MemoryRegion::AllocateGTTAccess)
+          ? agent_node_id
+          : m_region.owner()->node_id();
+
+  // Allocate memory.
+  // If it fails, attempt to release memory via the owner's Trim() and retry.
+  *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
+  if (*mem == nullptr) {
+    m_region.owner()->Trim();
+    *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
+  }
+
+  if (*mem != nullptr) {
+    if (kmt_alloc_flags.ui32.NoAddress)
+      return HSA_STATUS_SUCCESS;
+
+    // Commit the memory.
+    // For system memory, on non-restricted allocation, map it to all GPUs. On
+    // restricted allocation, only the CPU is allowed to access it by default,
+    // so there is no need to map.
+    // For local memory, only map it to the owning GPU. Mapping to other GPUs,
+    // if the access is allowed, is performed on AllowAccess.
+    HsaMemMapFlags map_flag = m_region.map_flags();
+    size_t map_node_count = 1;
+    const uint32_t owner_node_id = m_region.owner()->node_id();
+    const uint32_t *map_node_id = &owner_node_id;
+
+    if (m_region.IsSystem()) {
+      if ((alloc_flags & core::MemoryRegion::AllocateRestrict) == 0) {
+        // Map to all GPU agents.
+        map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();
+
+        if (map_node_count == 0) {
+          // No need to pin since there is no GPU in the platform.
+          return HSA_STATUS_SUCCESS;
+        }
+
+        map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
+      } else {
+        // No need to pin it for CPU exclusive access.
+        return HSA_STATUS_SUCCESS;
+      }
+    }
+
+    uint64_t alternate_va = 0;
+    const bool is_resident = MakeKfdMemoryResident(
+        map_node_count, map_node_id, *mem, size, &alternate_va, map_flag);
+
+    const bool require_pinning =
+        (!m_region.full_profile() || m_region.IsLocalMemory() ||
+         m_region.IsScratch());
+
+    if (require_pinning && !is_resident) {
+      FreeKfdMemory(*mem, size);
+      *mem = nullptr;
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+
+    if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
+        hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
+      FreeKfdMemory(*mem, size);
+      *mem = nullptr;
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+
+  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 }

-hsa_status_t KfdDriver::FreeMemory(void *mem, uint32_t node_id) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
+  MakeKfdMemoryUnresident(mem);
+  return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
 }

 hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
@@ -92,5 +233,45 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
  return HSA_STATUS_SUCCESS;
 }

+void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
+                                   size_t size) {
+  void *mem = nullptr;
+  const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flags, &mem);
+  return (status == HSAKMT_STATUS_SUCCESS) ? mem : nullptr;
+}
+
+bool KfdDriver::FreeKfdMemory(void *mem, size_t size) {
+  if (mem == nullptr || size == 0) {
+    debug_print("Invalid free ptr:%p size:%zu\n", mem, size);
+    return true; // Nothing to release; reported as success.
+  }
+
+  if (hsaKmtFreeMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
+    debug_print("Failed to free ptr:%p size:%zu\n", mem, size);
+    return false;
+  }
+  return true;
+}
+
+bool KfdDriver::MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
+                                      const void *mem, size_t size,
+                                      uint64_t *alternate_va,
+                                      HsaMemMapFlags map_flag) {
+  assert(num_node > 0);
+  assert(nodes);
+
+  *alternate_va = 0;
+
+  HSAKMT_STATUS kmt_status(hsaKmtMapMemoryToGPUNodes(
+      const_cast<void *>(mem), size, alternate_va, map_flag, num_node,
+      const_cast<uint32_t *>(nodes)));
+
+  return (kmt_status == HSAKMT_STATUS_SUCCESS);
+}
+
+void KfdDriver::MakeKfdMemoryUnresident(const void *mem) {
+  hsaKmtUnmapMemoryToGPU(const_cast<void *>(mem));
+}
+
 } // namespace AMD
 } // namespace rocr
@@ -47,6 +47,7 @@
 #include <memory>
 #include <string>

+#include "core/inc/amd_memory_region.h"
 #include "core/inc/runtime.h"
 #include "uapi/amdxdna_accel.h"

@@ -89,17 +90,18 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) {

 hsa_status_t
 XdnaDriver::GetMemoryProperties(uint32_t node_id,
-                                core::MemProperties &mprops) const {
+                                core::MemoryRegion &mem_region) const {
  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t XdnaDriver::AllocateMemory(void **mem, size_t size,
-                                        uint32_t node_id,
-                                        core::MemFlags flags) {
+hsa_status_t
+XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                           core::MemoryRegion::AllocateFlags alloc_flags,
+                           void **mem, size_t size, uint32_t node_id) {
  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t XdnaDriver::FreeMemory(void *mem, uint32_t node_id) {
+hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
  return HSA_STATUS_SUCCESS;
 }

@@ -49,11 +49,12 @@
 #include <vector>

 #include "core/inc/checked.h"
+#include "core/inc/driver.h"
 #include "core/inc/isa.h"
-#include "core/inc/queue.h"
 #include "core/inc/memory_region.h"
-#include "core/util/utils.h"
+#include "core/inc/queue.h"
 #include "core/util/locks.h"
+#include "core/util/utils.h"

 namespace rocr {

@@ -117,19 +118,18 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
  // @brief Agent class contructor.
  //
  // @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, DeviceType type)
-      : node_id_(node_id),
-        device_type_(uint32_t(type)),
-        profiling_enabled_(false),
-        enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, DeviceType type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(uint32_t(type)),
+        profiling_enabled_(false), enabled_(false) {
    public_handle_ = Convert(this);
  }

  // @brief Agent class contructor.
  //
  // @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, uint32_t type)
-      : node_id_(node_id), device_type_(type), profiling_enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, uint32_t type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(type),
+        profiling_enabled_(false), enabled_(false) {
    public_handle_ = Convert(this);
  }

@@ -315,7 +315,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
    for (auto region : regions()) region->Trim();
  }

- protected:
+  const DriverType driver_type;
+
+protected:
  // Intention here is to have a polymorphic update procedure for public_handle_
  // which is callable on any Agent* but only from some class dervied from
  // Agent*.  do_set_public_handle should remain protected or private in all
@@ -51,15 +51,16 @@

 #include "hsakmt/hsakmt.h"

-#include "core/inc/runtime.h"
 #include "core/inc/agent.h"
 #include "core/inc/blit.h"
-#include "core/inc/signal.h"
 #include "core/inc/cache.h"
+#include "core/inc/driver.h"
+#include "core/inc/runtime.h"
 #include "core/inc/scratch_cache.h"
-#include "core/util/small_heap.h"
-#include "core/util/locks.h"
+#include "core/inc/signal.h"
 #include "core/util/lazy_ptr.h"
+#include "core/util/locks.h"
+#include "core/util/small_heap.h"
 #include "pcs/pcs_runtime.h"

 namespace rocr {
@@ -72,142 +73,154 @@ typedef ScratchCache::ScratchInfo ScratchInfo;
 class GpuAgentInt : public core::Agent {
 public:
  // @brief Constructor
-  GpuAgentInt(uint32_t node_id)
-      : core::Agent(node_id,core::Agent::DeviceType::kAmdGpuDevice) {}
+   GpuAgentInt(uint32_t node_id)
+       : core::Agent(core::DriverType::KFD, node_id,
+                     core::Agent::DeviceType::kAmdGpuDevice) {}

-  // @brief Ensure blits are ready (performance hint).
-  virtual void PreloadBlits() {}
+   // @brief Ensure blits are ready (performance hint).
+   virtual void PreloadBlits() {}

-  // @brief Initialization hook invoked after tools library has loaded,
-  // to allow tools interception of interface functions.
-  //
-  // @retval HSA_STATUS_SUCCESS if initialization is successful.
-  virtual hsa_status_t PostToolsInit() = 0;
+   // @brief Initialization hook invoked after tools library has loaded,
+   // to allow tools interception of interface functions.
+   //
+   // @retval HSA_STATUS_SUCCESS if initialization is successful.
+   virtual hsa_status_t PostToolsInit() = 0;

-  // @brief Invoke the user provided callback for each region accessible by
-  // this agent.
-  //
-  // @param [in] include_peer If true, the callback will be also invoked on each
-  // peer memory region accessible by this agent. If false, only invoke the
-  // callback on memory region owned by this agent.
-  // @param [in] callback User provided callback function.
-  // @param [in] data User provided pointer as input for @p callback.
-  //
-  // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
-  // region returns ::HSA_STATUS_SUCCESS.
-  virtual hsa_status_t VisitRegion(bool include_peer,
-                                   hsa_status_t (*callback)(hsa_region_t region,
-                                                            void* data),
-                                   void* data) const = 0;
+   // @brief Invoke the user provided callback for each region accessible by
+   // this agent.
+   //
+   // @param [in] include_peer If true, the callback will be also invoked on
+   // each peer memory region accessible by this agent. If false, only invoke
+   // the callback on memory region owned by this agent.
+   // @param [in] callback User provided callback function.
+   // @param [in] data User provided pointer as input for @p callback.
+   //
+   // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
+   // region returns ::HSA_STATUS_SUCCESS.
+   virtual hsa_status_t
+   VisitRegion(bool include_peer,
+               hsa_status_t (*callback)(hsa_region_t region, void *data),
+               void *data) const = 0;

-  // @brief Carve scratch memory for main from scratch pool.
-  //
-  // @param [in/out] scratch Structure to be populated with the carved memory
-  // information.
-  virtual void AcquireQueueMainScratch(ScratchInfo& scratch) = 0;
+   // @brief Carve scratch memory for main from scratch pool.
+   //
+   // @param [in/out] scratch Structure to be populated with the carved memory
+   // information.
+   virtual void AcquireQueueMainScratch(ScratchInfo &scratch) = 0;

-  // @brief Carve scratch memory for alt from scratch pool.
-  //
-  // @param [in/out] scratch Structure to be populated with the carved memory
-  // information.
-  virtual void AcquireQueueAltScratch(ScratchInfo& scratch) = 0;
+   // @brief Carve scratch memory for alt from scratch pool.
+   //
+   // @param [in/out] scratch Structure to be populated with the carved memory
+   // information.
+   virtual void AcquireQueueAltScratch(ScratchInfo &scratch) = 0;

-  // @brief Release scratch memory from main back to scratch pool.
-  //
-  // @param [in/out] scratch Scratch memory previously acquired with call to
-  // ::AcquireQueueMainScratch.
-  virtual void ReleaseQueueMainScratch(ScratchInfo& base) = 0;
+   // @brief Release scratch memory from main back to scratch pool.
+   //
+   // @param [in/out] base Scratch memory previously acquired with call to
+   // ::AcquireQueueMainScratch.
+   virtual void ReleaseQueueMainScratch(ScratchInfo &base) = 0;

-  // @brief Release scratch memory back from alternate to scratch pool.
-  //
-  // @param [in/out] scratch Scratch memory  previously acquired with call to
-  // ::AcquireQueueAltcratch.
-  virtual void ReleaseQueueAltScratch(ScratchInfo& base) = 0;
+   // @brief Release scratch memory back from alternate to scratch pool.
+   //
+   // @param [in/out] base Scratch memory previously acquired with call to
+   // ::AcquireQueueAltScratch.
+   virtual void ReleaseQueueAltScratch(ScratchInfo &base) = 0;

-  // @brief Translate the kernel start and end dispatch timestamp from agent
-  // domain to host domain.
-  //
-  // @param [in] signal Pointer to signal that provides the dispatch timing.
-  // @param [out] time Structure to be populated with the host domain value.
-  virtual void TranslateTime(core::Signal* signal,
-                             hsa_amd_profiling_dispatch_time_t& time) = 0;
+   // @brief Translate the kernel start and end dispatch timestamp from agent
+   // domain to host domain.
+   //
+   // @param [in] signal Pointer to signal that provides the dispatch timing.
+   // @param [out] time Structure to be populated with the host domain value.
+   virtual void TranslateTime(core::Signal *signal,
+                              hsa_amd_profiling_dispatch_time_t &time) = 0;

-  // @brief Translate the async copy start and end timestamp from agent
-  // domain to host domain.
-  //
-  // @param [in] signal Pointer to signal that provides the async copy timing.
-  // @param [out] time Structure to be populated with the host domain value.
-  virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) = 0;
+   // @brief Translate the async copy start and end timestamp from agent
+   // domain to host domain.
+   //
+   // @param [in] signal Pointer to signal that provides the async copy timing.
+   // @param [out] time Structure to be populated with the host domain value.
+   virtual void TranslateTime(core::Signal *signal,
+                              hsa_amd_profiling_async_copy_time_t &time) = 0;

-  // @brief Translate timestamp agent domain to host domain.
-  //
-  // @param [out] time Timestamp in agent domain.
-  virtual uint64_t TranslateTime(uint64_t tick) = 0;
+   // @brief Translate timestamp agent domain to host domain.
+   //
+   // @param [in] tick Timestamp in agent domain.
+   virtual uint64_t TranslateTime(uint64_t tick) = 0;

-  // @brief Invalidate caches on the agent which may hold code object data.
-  virtual void InvalidateCodeCaches() = 0;
+   // @brief Invalidate caches on the agent which may hold code object data.
+   virtual void InvalidateCodeCaches() = 0;

-  // @brief Sets the coherency type of this agent.
-  //
-  // @param [in] type New coherency type.
-  //
-  // @retval true The new coherency type is set successfuly.
-  virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0;
+   // @brief Sets the coherency type of this agent.
+   //
+   // @param [in] type New coherency type.
+   //
+   // @retval true The new coherency type is set successfully.
+   virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0;

-  // @brief Returns the current coherency type of this agent.
-  //
-  // @retval Coherency type.
-  virtual hsa_amd_coherency_type_t current_coherency_type() const = 0;
+   // @brief Returns the current coherency type of this agent.
+   //
+   // @retval Coherency type.
+   virtual hsa_amd_coherency_type_t current_coherency_type() const = 0;

-  virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0;
+   virtual void RegisterGangPeer(core::Agent &gang_peer,
+                                 unsigned int bandwidth_factor) = 0;

-  virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0;
+   virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent &gang_peer,
+                                             uint32_t rec_sdma_eng_id_mask) = 0;

-  // @brief Query if agent represent Kaveri GPU.
-  //
-  // @retval true if agent is Kaveri GPU.
-  virtual bool is_kv_device() const = 0;
+   // @brief Query if agent represent Kaveri GPU.
+   //
+   // @retval true if agent is Kaveri GPU.
+   virtual bool is_kv_device() const = 0;

-  // @brief Query the agent HSA profile.
-  //
-  // @retval HSA profile.
-  virtual hsa_profile_t profile() const = 0;
+   // @brief Query the agent HSA profile.
+   //
+   // @retval HSA profile.
+   virtual hsa_profile_t profile() const = 0;

-  // @brief Query the agent memory bus width in bit.
-  //
-  // @retval Bus width in bit.
-  virtual uint32_t memory_bus_width() const = 0;
+   // @brief Query the agent memory bus width in bit.
+   //
+   // @retval Bus width in bit.
+   virtual uint32_t memory_bus_width() const = 0;

-  // @brief Query the agent memory maximum frequency in MHz.
-  //
-  // @retval Bus width in MHz.
-  virtual uint32_t memory_max_frequency() const = 0;
+   // @brief Query the agent memory maximum frequency in MHz.
+   //
+   // @retval Maximum memory frequency in MHz.
+   virtual uint32_t memory_max_frequency() const = 0;

-  // @brief Whether agent supports asynchronous scratch reclaim. Depends on CP FW
-  virtual bool AsyncScratchReclaimEnabled() const = 0;
+   // @brief Whether agent supports asynchronous scratch reclaim. Depends on CP
+   // FW
+   virtual bool AsyncScratchReclaimEnabled() const = 0;

-  // @brief Update the agent's scratch use-once threshold.
-  // Only valid when async scratch reclaim is supported
-  // @retval HSA_STATUS_SUCCESS if successful
-  virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0;
+   // @brief Update the agent's scratch use-once threshold.
+   // Only valid when async scratch reclaim is supported
+   // @retval HSA_STATUS_SUCCESS if successful
+   virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0;

-  // @brief Iterate through supported PC Sampling configurations
-  // @retval HSA_STATUS_SUCCESS if successful
-  virtual hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
-                                               void* cb_data) = 0;
+   // @brief Iterate through supported PC Sampling configurations
+   // @retval HSA_STATUS_SUCCESS if successful
+   virtual hsa_status_t
+   PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
+                           void *cb_data) = 0;

-  virtual hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) = 0;
+   virtual hsa_status_t
+   PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) = 0;

-  virtual hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
-                                              pcs::PcsRuntime::PcSamplingSession& session) = 0;
+   virtual hsa_status_t
+   PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
+                          pcs::PcsRuntime::PcSamplingSession &session) = 0;

-  virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0;
+   virtual hsa_status_t
+   PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) = 0;

-  virtual hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) = 0;
+   virtual hsa_status_t
+   PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) = 0;

-  virtual hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) = 0;
+   virtual hsa_status_t
+   PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) = 0;

-  virtual hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) = 0;
+   virtual hsa_status_t
+   PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) = 0;
 };

 class GpuAgent : public GpuAgentInt {
@@ -43,11 +43,21 @@
 #ifndef HSA_RUNTIME_CORE_INC_AMD_KFD_DRIVER_H_
 #define HSA_RUNTIME_CORE_INC_AMD_KFD_DRIVER_H_

-#include "core/inc/driver.h"
-
 #include <string>

+#include "hsakmt/hsakmt.h"
+
+#include "core/inc/driver.h"
+#include "core/inc/memory_region.h"
+
 namespace rocr {
+
+namespace core {
+
+class Queue;
+
+}
+
 namespace AMD {

 class KfdDriver : public core::Driver {
@@ -57,13 +67,33 @@ public:
  static hsa_status_t DiscoverDriver();

  hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
-  hsa_status_t GetMemoryProperties(uint32_t node_id,
-                                   core::MemProperties &mprops) const override;
-  hsa_status_t AllocateMemory(void **mem, size_t size, uint32_t node_id,
-                              core::MemFlags flags) override;
-  hsa_status_t FreeMemory(void *mem, uint32_t node_id) override;
+  hsa_status_t
+  GetMemoryProperties(uint32_t node_id,
+                      core::MemoryRegion &mem_region) const override;
+  hsa_status_t AllocateMemory(const core::MemoryRegion &mem_region,
+                              core::MemoryRegion::AllocateFlags alloc_flags,
+                              void **mem, size_t size,
+                              uint32_t node_id) override;
+  hsa_status_t FreeMemory(void *mem, size_t size) override;
  hsa_status_t CreateQueue(core::Queue &queue) override;
  hsa_status_t DestroyQueue(core::Queue &queue) const override;
+
+private:
+  /// @brief Allocate agent accessible memory (system / local memory).
+  static void *AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
+                                 size_t size);
+
+  /// @brief Free agent accessible memory (system / local memory).
+  static bool FreeKfdMemory(void *mem, size_t size);
+
+  /// @brief Pin memory.
+  static bool MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
+                                    const void *mem, size_t size,
+                                    uint64_t *alternate_va,
+                                    HsaMemMapFlags map_flag);
+
+  /// @brief Unpin memory.
+  static void MakeKfdMemoryUnresident(const void *mem);
 };

 } // namespace AMD
@@ -77,13 +77,6 @@ class MemoryRegion : public core::MemoryRegion {
    return reinterpret_cast<MemoryRegion*>(region.handle);
  }

-  /// @brief Allocate agent accessible memory (system / local memory).
-  static void* AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id,
-                                 size_t size);
-
-  /// @brief Free agent accessible memory (system / local memory).
-  static bool FreeKfdMemory(void* ptr, size_t size);
-
  static bool RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags);

  static void DeregisterMemory(void* ptr);
@@ -175,7 +168,15 @@ class MemoryRegion : public core::MemoryRegion {

  __forceinline size_t GetPageSize() const { return kPageSize_; }

- private:
+  __forceinline const HsaMemFlags &mem_flags() const { return mem_flag_; }
+  __forceinline const HsaMemMapFlags &map_flags() const { return map_flag_; }
+
+  void *fragment_alloc(size_t size) const {
+    return fragment_allocator_.alloc(size);
+  }
+  bool fragment_free(void *mem) const { return fragment_allocator_.free(mem); }
+
+private:
  const HsaMemoryProperties mem_props_;

  HsaMemFlags mem_flag_;
@@ -45,8 +45,13 @@
 #include <memory>

 #include "core/inc/driver.h"
+#include "core/inc/memory_region.h"

 namespace rocr {
+namespace core {
+class Queue;
+}
+
 namespace AMD {

 class XdnaDriver : public core::Driver {
@@ -57,11 +62,14 @@ public:
  static hsa_status_t DiscoverDriver();
  hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;

-  hsa_status_t GetMemoryProperties(uint32_t node_id,
-                                   core::MemProperties &mprops) const override;
-  hsa_status_t AllocateMemory(void **mem, size_t size, uint32_t node_id,
-                              core::MemFlags flags) override;
-  hsa_status_t FreeMemory(void *mem, uint32_t node_id) override;
+  hsa_status_t
+  GetMemoryProperties(uint32_t node_id,
+                      core::MemoryRegion &mem_region) const override;
+  hsa_status_t AllocateMemory(const core::MemoryRegion &mem_region,
+                              core::MemoryRegion::AllocateFlags alloc_flags,
+                              void **mem, size_t size,
+                              uint32_t node_id) override;
+  hsa_status_t FreeMemory(void *mem, size_t size) override;
  hsa_status_t CreateQueue(core::Queue &queue) override;
  hsa_status_t DestroyQueue(core::Queue &queue) const override;

@@ -46,20 +46,13 @@
 #include <limits>
 #include <string>

-#include "core/inc/agent.h"
 #include "core/inc/memory_region.h"
 #include "inc/hsa.h"

 namespace rocr {
 namespace core {

-using MemFlags = uint32_t;
-
-struct MemProperties {
-  MemFlags flags_;
-  size_t size_bytes_;
-  uint64_t virtual_base_addr_;
-};
+class Queue;

 struct DriverVersionInfo {
  uint32_t major;
@@ -85,17 +78,27 @@ class Driver {
  /// @retval HSA_STATUS_SUCCESS if the kernel-model driver query was
  /// successful.
  virtual hsa_status_t QueryKernelModeDriver(DriverQuery query) = 0;
+
  /// @brief Open a connection to the driver using name_.
  /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully.
  hsa_status_t Open();
+
  /// @brief Close a connection to the open driver using fd_.
  /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully.
  hsa_status_t Close();
+
  /// @brief Get driver version information.
  /// @retval DriverVersionInfo containing the driver's version information.
-  DriverVersionInfo Version() const { return version_; }
+  const DriverVersionInfo &Version() const { return version_; }

-  virtual hsa_status_t GetMemoryProperties(uint32_t node_id, MemProperties &mprops) const = 0;
+  /// @brief Get the memory properties of a specific node.
+  /// @param node_id Node ID of the agent
+  /// @param[in, out] mem_region MemoryRegion object whose properties will be
+  /// retrieved.
+  /// @retval HSA_STATUS_SUCCESS if the driver successfully returns the node's
+  ///         memory properties.
+  virtual hsa_status_t GetMemoryProperties(uint32_t node_id,
+                                           MemoryRegion &mem_region) const = 0;

  /// @brief Allocate agent-accessible memory (system or agent-local memory).
  ///
@@ -103,10 +106,12 @@ class Driver {
  ///
  /// @retval HSA_STATUS_SUCCESS if memory was successfully allocated or
  /// hsa_status_t error code if the memory allocation failed.
-  virtual hsa_status_t AllocateMemory(void** mem, size_t size, uint32_t node_id,
-                                      MemFlags flags) = 0;
+  virtual hsa_status_t AllocateMemory(const MemoryRegion &mem_region,
+                                      MemoryRegion::AllocateFlags alloc_flags,
+                                      void **mem, size_t size,
+                                      uint32_t node_id) = 0;

-  virtual hsa_status_t FreeMemory(void* mem, uint32_t node_id) = 0;
+  virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0;

  virtual hsa_status_t CreateQueue(Queue &queue) = 0;

@@ -43,12 +43,14 @@
 #include "core/inc/amd_aie_agent.h"

 #include "core/inc/amd_aie_aql_queue.h"
+#include "core/inc/driver.h"

 namespace rocr {
 namespace AMD {

 AieAgent::AieAgent(uint32_t node)
-    : core::Agent(node, core::Agent::DeviceType::kAmdAieDevice),
+    : core::Agent(core::DriverType::XDNA, node,
+                  core::Agent::DeviceType::kAmdAieDevice),
      max_queues_(core::Runtime::runtime_singleton_->flag().max_queues()) {
  InitRegionList();
 }
@@ -47,14 +47,16 @@
 #include <thread>

 #include "core/inc/amd_memory_region.h"
+#include "core/inc/driver.h"
 #include "core/inc/host_queue.h"

 #include "inc/hsa_ext_image.h"

 namespace rocr {
 namespace AMD {
-CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
-    : core::Agent(node, kAmdCpuDevice), properties_(node_props) {
+CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties &node_props)
+    : core::Agent(core::DriverType::KFD, node, kAmdCpuDevice),
+      properties_(node_props) {
  InitRegionList();

  InitCacheList();
@@ -59,25 +59,6 @@ namespace AMD {
 size_t MemoryRegion::max_sysmem_alloc_size_ = 0;
 size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE);

-void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) {
-  void* ret = NULL;
-  const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
-  return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
-}
-
-bool MemoryRegion::FreeKfdMemory(void* ptr, size_t size) {
-  if (ptr == NULL || size == 0) {
-    debug_print("Invalid free ptr:%p size:%lu\n", ptr, size);
-    return true;
-  }
-
-  if (hsaKmtFreeMemory(ptr, size) != HSAKMT_STATUS_SUCCESS) {
-    debug_print("Failed to free ptr:%p size:%lu\n", ptr, size);
-    return false;
-  }
-  return true;
-}
-
 bool MemoryRegion::RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags) {
  assert(ptr != NULL);
  assert(size != 0);
@@ -198,112 +179,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,

  size = AlignUp(size, kPageSize_);

-  HsaMemFlags kmt_alloc_flags(mem_flag_);
-  kmt_alloc_flags.ui32.ExecuteAccess =
-      (alloc_flags & AllocateExecutable ? 1 : 0);
-  kmt_alloc_flags.ui32.AQLQueueMemory =
-      (alloc_flags & AllocateDoubleMap ? 1 : 0);
-  if (IsSystem() && (alloc_flags & AllocateNonPaged))
-      kmt_alloc_flags.ui32.NonPaged = 1;
-
-  // Allocating a memory handle for virtual memory
-  kmt_alloc_flags.ui32.NoAddress = !!(alloc_flags & AllocateMemoryOnly);
-
-  // Allocate pseudo fine grain memory
-  kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain);
-  kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute);
-
-  kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess);
-  if (IsLocalMemory()) {
-    // Allocate physically contiguous memory - AllocateKfdMemory function call will fail
-    // if this flag is not supported in KFD.
-    kmt_alloc_flags.ui32.Contiguous =
-        (alloc_flags & AllocateContiguous ? 1 : kmt_alloc_flags.ui32.Contiguous);
-  }
-
-  // Only allow using the suballocator for ordinary VRAM.
-  if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
-    bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
-    // Avoid modifying executable or queue allocations.
-    bool useSubAlloc = subAllocEnabled;
-    useSubAlloc &= ((alloc_flags & (~AllocateRestrict)) == 0);
-    if (useSubAlloc) {
-      *address = fragment_allocator_.alloc(size);
-
-      if ((alloc_flags & AllocateAsan) &&
-          hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) {
-        fragment_allocator_.free(*address);
-        *address = NULL;
-        return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-      }
-      return HSA_STATUS_SUCCESS;
-    }
-  }
-
-  const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id();
-
-  // Allocate memory.
-  // If it fails attempt to release memory from the block allocator and retry.
-  *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
-  if (*address == nullptr) {
-    owner()->Trim();
-    *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
-  }
-
-  if (*address != nullptr) {
-    if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS;
-
-    // Commit the memory.
-    // For system memory, on non-restricted allocation, map it to all GPUs. On
-    // restricted allocation, only CPU is allowed to access by default, so
-    // no need to map
-    // For local memory, only map it to the owning GPU. Mapping to other GPU,
-    // if the access is allowed, is performed on AllowAccess.
-    HsaMemMapFlags map_flag = map_flag_;
-    size_t map_node_count = 1;
-    const uint32_t owner_node_id = owner()->node_id();
-    const uint32_t* map_node_id = &owner_node_id;
-
-    if (IsSystem()) {
-      if ((alloc_flags & AllocateRestrict) == 0) {
-        // Map to all GPU agents.
-        map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();
-
-        if (map_node_count == 0) {
-          // No need to pin since no GPU in the platform.
-          return HSA_STATUS_SUCCESS;
-        }
-
-        map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
-      } else {
-        // No need to pin it for CPU exclusive access.
-        return HSA_STATUS_SUCCESS;
-      }
-    }
-
-    uint64_t alternate_va = 0;
-    const bool is_resident = MakeKfdMemoryResident(
-        map_node_count, map_node_id, *address, size, &alternate_va, map_flag);
-
-    const bool require_pinning =
-        (!full_profile() || IsLocalMemory() || IsScratch());
-
-    if (require_pinning && !is_resident) {
-      FreeKfdMemory(*address, size);
-      *address = NULL;
-      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-    }
-
-    if ((alloc_flags & AllocateAsan) &&
-        hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) {
-      FreeKfdMemory(*address, size);
-      *address = NULL;
-      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-    }
-    return HSA_STATUS_SUCCESS;
-  }
-
-  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  return core::Runtime::runtime_singleton_->AgentDriver(owner()->driver_type)
+      .AllocateMemory(*this, alloc_flags, address, size, agent_node_id);
 }

 hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
@@ -314,9 +191,8 @@ hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
 hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {
  if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS;

-  MakeKfdMemoryUnresident(address);
-
-  return FreeKfdMemory(address, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+  return core::Runtime::runtime_singleton_->AgentDriver(owner()->driver_type)
+      .FreeMemory(address, size);  // reached only when fragment_allocator_.free() is false
 }

 // TODO:  Look into a better name and/or making this process transparent to exporting.