From 1351cd7fa802efde2d4d2e5e6a904ba863c60532 Mon Sep 17 00:00:00 2001
From: "Sang, Tao" <Tao.Sang@amd.com>
Date: Fri, 11 Jul 2025 10:38:40 -0400
Subject: [PATCH] SWDEV-539145 - Simplify host memory pool management (#668)

* SWDEV-539145 - Simplify host memory pool management

Remove unnecessary variables and functions.
Make the code simpler and clearer.

* Change cpu_agent_info_ into a pointer.

* Restore getPreferredNumaNode()
---
 rocclr/device/rocm/rocdevice.cpp | 107 ++++++++++++++++---------------
 rocclr/device/rocm/rocdevice.hpp |  27 ++++----
 rocclr/device/rocm/rocmemory.cpp |  34 ++--------
 rocclr/device/rocm/rocmemory.hpp |   8 +++
 4 files changed, 77 insertions(+), 99 deletions(-)

diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 7ada04d7ea..02b2cc9e04 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -191,11 +191,8 @@ Device::Device(hsa_agent_t bkendDevice)
     , numOfVgpus_(0)
     , preferred_numa_node_(0)
     , maxSdmaReadMask_(0)
-    , maxSdmaWriteMask_(0) {
+    , maxSdmaWriteMask_(0), cpu_agent_info_(nullptr) {
   group_segment_.handle = 0;
-  system_segment_.handle = 0;
-  system_coarse_segment_.handle = 0;
-  system_kernarg_segment_.handle = 0;
   gpuvm_segment_.handle = 0;
   gpu_fine_grained_segment_.handle = 0;
   gpu_ext_fine_grained_segment_.handle = 0;
@@ -225,20 +222,20 @@ void Device::setupCpuAgent() {
   }
 
   preferred_numa_node_ = index;
-  cpu_agent_ = cpu_agents_[index].agent;
-  system_segment_ = cpu_agents_[index].fine_grain_pool;
-  system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool;
-  system_kernarg_segment_ = cpu_agents_[index].kern_arg_pool;
-  system_ext_segment_ = cpu_agents_[index].ext_fine_grain_pool;
+  cpu_agent_info_ = &cpu_agents_[index];
+
   ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx,"
-          "coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index, cpu_agent_.handle,
-          system_segment_.handle, system_coarse_segment_.handle, bkendDevice_.handle, isXgmi_);
+          "coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index,
+          cpu_agent_info_->agent.handle,
+          cpu_agent_info_->fine_grain_pool.handle,
+          cpu_agent_info_->coarse_grain_pool.handle,
+          bkendDevice_.handle, isXgmi_);
 }
 
 void Device::checkAtomicSupport() {
   std::vector<amd::Device::LinkAttrType> link_attrs;
   link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0));
-  if (findLinkInfo(system_segment_, &link_attrs)) {
+  if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) {
     if (link_attrs[0].second == 1) {
       info_.pcie_atomics_ = true;
     }
@@ -863,7 +860,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
           // If cpu agent cannot access this pool, the device does not support large bar.
           hsa_amd_memory_pool_access_t tmp{};
           hsa_amd_agent_memory_pool_get_info(
-            dev->cpu_agent_,
+            dev->cpu_agent_info_->agent,
             pool,
             HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
             &tmp);
@@ -1166,7 +1163,7 @@ bool Device::populateOCLDeviceConstants() {
 
   checkAtomicSupport();
 
-  assert(system_segment_.handle != 0);
+  assert(cpu_agent_info_->fine_grain_pool.handle != 0);
   if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
                                 bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
     return false;
@@ -1286,7 +1283,8 @@ bool Device::populateOCLDeviceConstants() {
 
     if (HSA_STATUS_SUCCESS !=
         hsa_amd_memory_pool_get_info(
-            system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) {
+            cpu_agent_info_->fine_grain_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
+            &alloc_granularity_)) {
       return false;
     }
   }
@@ -2005,46 +2003,54 @@ device::Memory* Device::createMemory(size_t size, size_t alignment) const {
 }
 
 // ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
-  void* ptr = nullptr;
-
+hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
+                                                const AgentInfo* agentInfo) const {
+  if (agentInfo == nullptr) {
+    agentInfo = cpu_agent_info_;
+  }
   hsa_amd_memory_pool_t segment{0};
   switch (mem_seg) {
     case kKernArg : {
       if (settings().fgs_kernel_arg_) {
-        segment = system_kernarg_segment_;
+        segment = agentInfo->kern_arg_pool;
         break;
       }
       // Falls through on else case.
     }
     case kNoAtomics :
       // If runtime disables barrier, then all host allocations must have L2 disabled
-      if (system_coarse_segment_.handle != 0) {
-        segment = system_coarse_segment_;
+      if (agentInfo->coarse_grain_pool.handle != 0) {
+        segment = agentInfo->coarse_grain_pool;
         break;
       }
       // Falls through on else case.
     case kAtomics :
-      segment = system_segment_;
+      segment = agentInfo->fine_grain_pool;
       break;
     case kUncachedAtomics :
-      if (system_ext_segment_.handle != 0) {
+      if (agentInfo->ext_fine_grain_pool.handle != 0) {
         ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                   "Using extended fine grained access system memory pool");
-        segment = system_ext_segment_;
+        segment = agentInfo->ext_fine_grain_pool;
       } else {
         ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                   "Falling through on fine grained access system memory pool");
-        segment = system_segment_;
+        segment = agentInfo->fine_grain_pool;
       }
       break;
     default :
       guarantee(false, "Invalid Memory Segment");
       break;
   }
-
   assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  return segment;
+}
+
+// ================================================================================================
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+  void* ptr = nullptr;
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
   ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx,"
      " numa_node = %d, mem_seg = %d", ptr, size, preferred_numa_node_, static_cast<int>(mem_seg));
   if (stat != HSA_STATUS_SUCCESS) {
@@ -2065,28 +2071,8 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
 // ================================================================================================
 void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
   void* ptr = nullptr;
-  hsa_amd_memory_pool_t segment = agentInfo.fine_grain_pool;
-  switch (mem_seg) {
-    case kNoAtomics :
-      if (agentInfo.coarse_grain_pool.handle != 0) {
-        segment = agentInfo.coarse_grain_pool;
-      }
-      break;
-    case kUncachedAtomics :
-      if (agentInfo.ext_fine_grain_pool.handle != 0) {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                  "Using extended fine grained access system memory pool in hostAgentAlloc");
-        segment = agentInfo.ext_fine_grain_pool;
-      } else {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                  "Falling through on fine grained access system memory pool in hostAgentAlloc");
-      }
-      break;
-    default :
-      break;
-  }
-  assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
   ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
   if (stat != HSA_STATUS_SUCCESS) {
     LogPrintfError("Fail allocation host memory with err %d", stat);
@@ -2144,6 +2130,21 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
   return ptr;
 }
 
+//! Pin hostMem for this GPU agent in the host pool picked by getHostMemoryPool(memSegment);
+//! returns nullptr on failure. The pool's handle is logged: a struct is not a valid %p argument.
+void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const {
+  const hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment);
+  void* deviceMemory = nullptr;
+  const hsa_status_t status = hsa_amd_memory_lock_to_pool(
+      hostMem, size, const_cast<hsa_agent_t*>(&bkendDevice_), 1, pool, 0, &deviceMemory);
+  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, hostMem = %p,"
+          " deviceMemory = %p, memSegment = %d", reinterpret_cast<void*>(pool.handle), size,
+          hostMem, deviceMemory, static_cast<int>(memSegment));
+  if (status == HSA_STATUS_SUCCESS) return deviceMemory;
+  DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
+  return nullptr;
+}
+
 void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }
 
 bool Device::deviceAllowAccess(void* ptr) const {
@@ -2585,11 +2586,11 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
         case amd::MemRangeAttribute::AccessedBy:
           accessed_by = attr.size();
           // Add all GPU devices into the query
-          for (const auto agent : getGpuAgents()) {
+          for (const auto agent : gpu_agents_) {
             attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle});
           }
           // Add CPU devices
-          for (const auto agent_info : getCpuAgents()) {
+          for (const auto agent_info : cpu_agents_) {
             attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle});
           }
           accessed_by = attr.size() - accessed_by;
@@ -2643,7 +2644,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
             }
           }
           // Find CPU agent returned by ROCr
-          for (auto& agent_info : getCpuAgents()) {
+          for (auto& agent_info : cpu_agents_) {
             if (agent_info.agent.handle == it.value) {
               *reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::CpuDeviceId);
             }
@@ -2678,7 +2679,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
                   }
                 }
                 // Find CPU agent returned by ROCr
-                for (auto& agent_info : getCpuAgents()) {
+                for (auto& agent_info : cpu_agents_) {
                   if (agent_info.agent.handle == it.value) {
                     reinterpret_cast<int32_t*>(data[idx])[entry] =
                       static_cast<int32_t>(amd::CpuDeviceId);
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 140431ec1b..85b10edb6e 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -341,10 +341,8 @@ class Device : public NullDevice {
   static bool loadHsaModules();
 
   hsa_agent_t getBackendDevice() const { return bkendDevice_; }
-  const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU
-
-  static const std::vector<hsa_agent_t>& getGpuAgents() { return gpu_agents_; }
-  static const std::vector<AgentInfo>& getCpuAgents() { return cpu_agents_; }
+  //! Get the CPU agent with the least NUMA distance to this GPU
+  const hsa_agent_t &getCpuAgent() const { return cpu_agent_info_->agent; }
 
   void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU
 
@@ -408,7 +406,6 @@ class Device : public NullDevice {
   virtual bool globalFreeMemory(size_t* freeMemory) const;
   virtual void* hostAlloc(size_t size, size_t alignment,
                           MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
-
   virtual void hostFree(void* ptr, size_t size = 0) const;
 
   bool deviceAllowAccess(void* dst) const;
@@ -459,6 +456,10 @@ class Device : public NullDevice {
   //! Allocate host memory from agent info
   void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
 
+  //! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
+  //! return a new device pointer accessible by the GPU agent.
+  void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
+
   //! Returns transfer engine object
   const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
 
@@ -501,10 +502,6 @@ class Device : public NullDevice {
 
   VirtualGPU* xferQueue() const;
 
-  hsa_amd_memory_pool_t SystemSegment() const { return system_segment_; }
-  hsa_amd_memory_pool_t SystemExtSegment() const { return system_ext_segment_; }
-  hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
-
   //! Acquire HSA queue. This method can create a new HSA queue or
   //! share previously created
   hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
@@ -547,6 +544,7 @@ class Device : public NullDevice {
   virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0);
 
   const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; }
+
   const bool isFineGrainSupported() const;
 
   //! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
@@ -588,8 +586,6 @@ class Device : public NullDevice {
   static bool isHsaInitialized_;
   static std::vector<hsa_agent_t> gpu_agents_;
   static std::vector<AgentInfo> cpu_agents_;
-
-  hsa_agent_t cpu_agent_;
   uint32_t preferred_numa_node_;
   std::vector<hsa_agent_t> p2p_agents_;  //!< List of P2P agents available for this device
   mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
@@ -598,10 +594,8 @@ class Device : public NullDevice {
   hsa_agent_t* p2p_agents_list_ = nullptr;
   hsa_profile_t agent_profile_;
   hsa_amd_memory_pool_t group_segment_;
-  hsa_amd_memory_pool_t system_segment_;
-  hsa_amd_memory_pool_t system_coarse_segment_;
-  hsa_amd_memory_pool_t system_kernarg_segment_;
-  hsa_amd_memory_pool_t system_ext_segment_;
+
+  AgentInfo *cpu_agent_info_;
 
   hsa_amd_memory_pool_t gpuvm_segment_;
   hsa_amd_memory_pool_t gpu_fine_grained_segment_;
@@ -649,7 +643,8 @@ class Device : public NullDevice {
 
   //! Pool of HSA queues with custom CU masks
   std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queueWithCUMaskPool_;
-
+  hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg,
+                                          const AgentInfo* agentInfo = nullptr) const;
   //! Read and Write mask for device<->host
   uint32_t maxSdmaReadMask_;
   uint32_t maxSdmaWriteMask_;
diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp
index 50a981ec5e..6308e28e4e 100644
--- a/rocclr/device/rocm/rocmemory.cpp
+++ b/rocclr/device/rocm/rocmemory.cpp
@@ -825,10 +825,7 @@ bool Buffer::create(bool alloc_local) {
             deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
           }
         } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
-          deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
-                      ? Device::MemorySegment::kNoAtomics :
-                      ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
-                      Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
+          deviceMemory_ = dev().hostNumaAlloc(size(), 1, getHostMemorySegment(memFlags));
         } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
           // TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal,
           // replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready.
@@ -852,10 +849,7 @@ bool Buffer::create(bool alloc_local) {
           // Disable host access to force blit path for memeory writes.
           flags_ &= ~HostMemoryDirectAccess;
         } else {
-          deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
-                 ? Device::MemorySegment::kNoAtomics :
-                 ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
-                 Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
+          deviceMemory_ = dev().hostAlloc(size(), 1, getHostMemorySegment(memFlags));
         }
       } else {
         assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
@@ -1012,28 +1006,8 @@ bool Buffer::create(bool alloc_local) {
     owner()->setHostMem(deviceMemory_);
   } else if (owner()->getSvmPtr() != owner()->getHostMem()) {
     if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
-      hsa_amd_memory_pool_t pool = dev().SystemSegment(); // Default
-      if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) {
-        if (dev().SystemCoarseSegment().handle != 0) {
-          pool = dev().SystemCoarseSegment();
-        }
-      } else if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) {
-        if (dev().SystemExtSegment().handle != 0) {
-          pool = dev().SystemExtSegment();
-          ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                  "Using extended fine grained access system memory pool to lock");
-        }
-      }
-      hsa_agent_t hsa_agent = dev().getBackendDevice();
-      hsa_status_t status = hsa_amd_memory_lock_to_pool(owner()->getHostMem(),
-          owner()->getSize(), &hsa_agent, 1, pool, 0, &deviceMemory_);
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, HostPtr = %p,"
-              " DevPtr = %p, memFlags = 0x%xh", pool, owner()->getSize(),
-              owner()->getHostMem(), deviceMemory_, memFlags);
-      if (status != HSA_STATUS_SUCCESS) {
-        DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
-        deviceMemory_ = nullptr;
-      }
+      deviceMemory_ = dev().hostLock(owner()->getHostMem(), owner()->getSize(),
+                                     getHostMemorySegment(memFlags));
     } else {
       deviceMemory_ = owner()->getHostMem();
     }
diff --git a/rocclr/device/rocm/rocmemory.hpp b/rocclr/device/rocm/rocmemory.hpp
index 262be44457..746ee4fc00 100644
--- a/rocclr/device/rocm/rocmemory.hpp
+++ b/rocclr/device/rocm/rocmemory.hpp
@@ -151,6 +151,14 @@ class Memory : public device::Memory {
 
   void* persistent_host_ptr_;  //!< Host accessible pointer for persistent memory
 
+  // Map host allocation flags to a MemorySegment: kNoAtomics unless CL_MEM_SVM_ATOMICS is set;
+  // with SVM atomics, ROCCLR_MEM_HSA_UNCACHED selects kUncachedAtomics, otherwise kAtomics.
+  Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) {
+    if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) return Device::MemorySegment::kNoAtomics;
+    const bool uncached = (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0;
+    return uncached ? Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics;
+  }
+
  private:
   // Disable copy constructor
   Memory(const Memory&);