SWDEV-539145 - Simplify host memory pool management (#668)

* SWDEV-539145 - Simplify host memory pool management. Remove unnecessary variables and functions. Make the code simpler and clearer. * Change cpu_agent_info_ into a pointer. * Restore getPreferredNumaNode()
2025-07-11 10:38:40 -04:00
@@ -191,11 +191,8 @@ Device::Device(hsa_agent_t bkendDevice)
    , numOfVgpus_(0)
    , preferred_numa_node_(0)
    , maxSdmaReadMask_(0)
-    , maxSdmaWriteMask_(0) {
+    , maxSdmaWriteMask_(0), cpu_agent_info_(nullptr) {
  group_segment_.handle = 0;
-  system_segment_.handle = 0;
-  system_coarse_segment_.handle = 0;
-  system_kernarg_segment_.handle = 0;
  gpuvm_segment_.handle = 0;
  gpu_fine_grained_segment_.handle = 0;
  gpu_ext_fine_grained_segment_.handle = 0;
@@ -225,20 +222,20 @@ void Device::setupCpuAgent() {
  }

  preferred_numa_node_ = index;
-  cpu_agent_ = cpu_agents_[index].agent;
-  system_segment_ = cpu_agents_[index].fine_grain_pool;
-  system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool;
-  system_kernarg_segment_ = cpu_agents_[index].kern_arg_pool;
-  system_ext_segment_ = cpu_agents_[index].ext_fine_grain_pool;
+  cpu_agent_info_ = &cpu_agents_[index];
+
  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx,"
-          "coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index, cpu_agent_.handle,
-          system_segment_.handle, system_coarse_segment_.handle, bkendDevice_.handle, isXgmi_);
+          "coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index,
+          cpu_agent_info_->agent.handle,
+          cpu_agent_info_->fine_grain_pool.handle,
+          cpu_agent_info_->coarse_grain_pool.handle,
+          bkendDevice_.handle, isXgmi_);
 }

 void Device::checkAtomicSupport() {
  std::vector<amd::Device::LinkAttrType> link_attrs;
  link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0));
-  if (findLinkInfo(system_segment_, &link_attrs)) {
+  if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) {
    if (link_attrs[0].second == 1) {
      info_.pcie_atomics_ = true;
    }
@@ -863,7 +860,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
          // If cpu agent cannot access this pool, the device does not support large bar.
          hsa_amd_memory_pool_access_t tmp{};
          hsa_amd_agent_memory_pool_get_info(
-            dev->cpu_agent_,
+            dev->cpu_agent_info_->agent,
            pool,
            HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
            &tmp);
@@ -1166,7 +1163,7 @@ bool Device::populateOCLDeviceConstants() {

  checkAtomicSupport();

-  assert(system_segment_.handle != 0);
+  assert(cpu_agent_info_->fine_grain_pool.handle != 0);
  if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
                                bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
    return false;
@@ -1286,7 +1283,8 @@ bool Device::populateOCLDeviceConstants() {

    if (HSA_STATUS_SUCCESS !=
        hsa_amd_memory_pool_get_info(
-            system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) {
+            cpu_agent_info_->fine_grain_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
+            &alloc_granularity_)) {
      return false;
    }
  }
@@ -2005,46 +2003,54 @@ device::Memory* Device::createMemory(size_t size, size_t alignment) const {
 }

 // ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
-  void* ptr = nullptr;
-
+hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,  // segment -> host pool
+                                                const AgentInfo* agentInfo) const {
+  if (agentInfo == nullptr) {
+    agentInfo = cpu_agent_info_;  // no agent supplied: use the CPU agent chosen for this device
+  }
  hsa_amd_memory_pool_t segment{0};
  switch (mem_seg) {
    case kKernArg : {
      if (settings().fgs_kernel_arg_) {
-        segment = system_kernarg_segment_;
+        segment = agentInfo->kern_arg_pool;  // kernarg pool only when fgs_kernel_arg_ is set
        break;
      }
      // Falls through on else case.
    }
    case kNoAtomics :
      // If runtime disables barrier, then all host allocations must have L2 disabled
-      if (system_coarse_segment_.handle != 0) {
-        segment = system_coarse_segment_;
+      if (agentInfo->coarse_grain_pool.handle != 0) {  // a zero handle skips the coarse pool
+        segment = agentInfo->coarse_grain_pool;
        break;
      }
      // Falls through on else case.
    case kAtomics :
-      segment = system_segment_;
+      segment = agentInfo->fine_grain_pool;  // kAtomics, and the fallback for the cases above
      break;
    case kUncachedAtomics :
-      if (system_ext_segment_.handle != 0) {
+      if (agentInfo->ext_fine_grain_pool.handle != 0) {
        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                  "Using extended fine grained access system memory pool");
-        segment = system_ext_segment_;
+        segment = agentInfo->ext_fine_grain_pool;
      } else {
        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                  "Falling through on fine grained access system memory pool");
-        segment = system_segment_;
+        segment = agentInfo->fine_grain_pool;  // extended pool handle is 0: use fine-grain pool
      }
      break;
    default :
      guarantee(false, "Invalid Memory Segment");
      break;
  }
-
  assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  return segment;  // the assert above checks the selected handle is non-zero
+}
+
+// ================================================================================================
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+  void* ptr = nullptr;
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx,"
     " numa_node = %d, mem_seg = %d", ptr, size, preferred_numa_node_, static_cast<int>(mem_seg));
  if (stat != HSA_STATUS_SUCCESS) {
@@ -2065,28 +2071,8 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
 // ================================================================================================
 void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
  void* ptr = nullptr;
-  hsa_amd_memory_pool_t segment = agentInfo.fine_grain_pool;
-  switch (mem_seg) {
-    case kNoAtomics :
-      if (agentInfo.coarse_grain_pool.handle != 0) {
-        segment = agentInfo.coarse_grain_pool;
-      }
-      break;
-    case kUncachedAtomics :
-      if (agentInfo.ext_fine_grain_pool.handle != 0) {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                  "Using extended fine grained access system memory pool in hostAgentAlloc");
-        segment = agentInfo.ext_fine_grain_pool;
-      } else {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                  "Falling through on fine grained access system memory pool in hostAgentAlloc");
-      }
-      break;
-    default :
-      break;
-  }
-  assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
  if (stat != HSA_STATUS_SUCCESS) {
    LogPrintfError("Fail allocation host memory with err %d", stat);
@@ -2144,6 +2130,21 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
  return ptr;
 }

+// Pins hostMem to the host pool for memSegment; returns nullptr if the lock fails.
+void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const {
+  const hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment);
+  void* deviceMemory = nullptr;
+  const hsa_status_t status = hsa_amd_memory_lock_to_pool(hostMem, size,
+      const_cast<hsa_agent_t*>(&bkendDevice_), 1, pool, 0, &deviceMemory);
+  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool 0x%zx, size 0x%zx, hostMem = %p,"
+          " deviceMemory = %p, memSegment = %d", pool.handle, size, hostMem, deviceMemory,
+          static_cast<int>(memSegment));  // pool.handle: a struct must not be passed for %p
+  if (status != HSA_STATUS_SUCCESS) {
+    DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
+  }
+  return (status == HSA_STATUS_SUCCESS) ? deviceMemory : nullptr;
+}
+
 void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }

 bool Device::deviceAllowAccess(void* ptr) const {
@@ -2585,11 +2586,11 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
        case amd::MemRangeAttribute::AccessedBy:
          accessed_by = attr.size();
          // Add all GPU devices into the query
-          for (const auto agent : getGpuAgents()) {
+          for (const auto agent : gpu_agents_) {
            attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle});
          }
          // Add CPU devices
-          for (const auto agent_info : getCpuAgents()) {
+          for (const auto agent_info : cpu_agents_) {
            attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle});
          }
          accessed_by = attr.size() - accessed_by;
@@ -2643,7 +2644,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
            }
          }
          // Find CPU agent returned by ROCr
-          for (auto& agent_info : getCpuAgents()) {
+          for (auto& agent_info : cpu_agents_) {
            if (agent_info.agent.handle == it.value) {
              *reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::CpuDeviceId);
            }
@@ -2678,7 +2679,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
                  }
                }
                // Find CPU agent returned by ROCr
-                for (auto& agent_info : getCpuAgents()) {
+                for (auto& agent_info : cpu_agents_) {
                  if (agent_info.agent.handle == it.value) {
                    reinterpret_cast<int32_t*>(data[idx])[entry] =
                      static_cast<int32_t>(amd::CpuDeviceId);
@@ -341,10 +341,8 @@ class Device : public NullDevice {
  static bool loadHsaModules();

  hsa_agent_t getBackendDevice() const { return bkendDevice_; }
-  const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU
-
-  static const std::vector<hsa_agent_t>& getGpuAgents() { return gpu_agents_; }
-  static const std::vector<AgentInfo>& getCpuAgents() { return cpu_agents_; }
+  //! Get the CPU agent with the least NUMA distance to this GPU
+  const hsa_agent_t &getCpuAgent() const { return cpu_agent_info_->agent; }

  void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU

@@ -408,7 +406,6 @@ class Device : public NullDevice {
  virtual bool globalFreeMemory(size_t* freeMemory) const;
  virtual void* hostAlloc(size_t size, size_t alignment,
                          MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
-
  virtual void hostFree(void* ptr, size_t size = 0) const;

  bool deviceAllowAccess(void* dst) const;
@@ -459,6 +456,10 @@ class Device : public NullDevice {
  //! Allocate host memory from agent info
  void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;

+  //! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
+  //! return a new device pointer accessible by the GPU agent.
+  void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
+
  //! Returns transfer engine object
  const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }

@@ -501,10 +502,6 @@ class Device : public NullDevice {

  VirtualGPU* xferQueue() const;

-  hsa_amd_memory_pool_t SystemSegment() const { return system_segment_; }
-  hsa_amd_memory_pool_t SystemExtSegment() const { return system_ext_segment_; }
-  hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
-
  //! Acquire HSA queue. This method can create a new HSA queue or
  //! share previously created
  hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
@@ -547,6 +544,7 @@ class Device : public NullDevice {
  virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0);

  const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; }
+
  const bool isFineGrainSupported() const;

  //! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
@@ -588,8 +586,6 @@ class Device : public NullDevice {
  static bool isHsaInitialized_;
  static std::vector<hsa_agent_t> gpu_agents_;
  static std::vector<AgentInfo> cpu_agents_;
-
-  hsa_agent_t cpu_agent_;
  uint32_t preferred_numa_node_;
  std::vector<hsa_agent_t> p2p_agents_;  //!< List of P2P agents available for this device
  mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
@@ -598,10 +594,8 @@ class Device : public NullDevice {
  hsa_agent_t* p2p_agents_list_ = nullptr;
  hsa_profile_t agent_profile_;
  hsa_amd_memory_pool_t group_segment_;
-  hsa_amd_memory_pool_t system_segment_;
-  hsa_amd_memory_pool_t system_coarse_segment_;
-  hsa_amd_memory_pool_t system_kernarg_segment_;
-  hsa_amd_memory_pool_t system_ext_segment_;
+
+  AgentInfo *cpu_agent_info_;

  hsa_amd_memory_pool_t gpuvm_segment_;
  hsa_amd_memory_pool_t gpu_fine_grained_segment_;
@@ -649,7 +643,8 @@ class Device : public NullDevice {

  //! Pool of HSA queues with custom CU masks
  std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queueWithCUMaskPool_;
-
+  hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg,
+                                          const AgentInfo* agentInfo = nullptr) const;
  //! Read and Write mask for device<->host
  uint32_t maxSdmaReadMask_;
  uint32_t maxSdmaWriteMask_;
@@ -825,10 +825,7 @@ bool Buffer::create(bool alloc_local) {
            deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
          }
        } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
-          deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
-                      ? Device::MemorySegment::kNoAtomics :
-                      ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
-                      Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
+          deviceMemory_ = dev().hostNumaAlloc(size(), 1, getHostMemorySegment(memFlags));
        } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
          // TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal,
          // replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready.
@@ -852,10 +849,7 @@ bool Buffer::create(bool alloc_local) {
          // Disable host access to force blit path for memeory writes.
          flags_ &= ~HostMemoryDirectAccess;
        } else {
-          deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
-                 ? Device::MemorySegment::kNoAtomics :
-                 ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
-                 Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
+          deviceMemory_ = dev().hostAlloc(size(), 1, getHostMemorySegment(memFlags));
        }
      } else {
        assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
@@ -1012,28 +1006,8 @@ bool Buffer::create(bool alloc_local) {
    owner()->setHostMem(deviceMemory_);
  } else if (owner()->getSvmPtr() != owner()->getHostMem()) {
    if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
-      hsa_amd_memory_pool_t pool = dev().SystemSegment(); // Default
-      if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) {
-        if (dev().SystemCoarseSegment().handle != 0) {
-          pool = dev().SystemCoarseSegment();
-        }
-      } else if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) {
-        if (dev().SystemExtSegment().handle != 0) {
-          pool = dev().SystemExtSegment();
-          ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                  "Using extended fine grained access system memory pool to lock");
-        }
-      }
-      hsa_agent_t hsa_agent = dev().getBackendDevice();
-      hsa_status_t status = hsa_amd_memory_lock_to_pool(owner()->getHostMem(),
-          owner()->getSize(), &hsa_agent, 1, pool, 0, &deviceMemory_);
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, HostPtr = %p,"
-              " DevPtr = %p, memFlags = 0x%xh", pool, owner()->getSize(),
-              owner()->getHostMem(), deviceMemory_, memFlags);
-      if (status != HSA_STATUS_SUCCESS) {
-        DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
-        deviceMemory_ = nullptr;
-      }
+      deviceMemory_ = dev().hostLock(owner()->getHostMem(), owner()->getSize(),
+                                     getHostMemorySegment(memFlags));
    } else {
      deviceMemory_ = owner()->getHostMem();
    }
@@ -151,6 +151,14 @@ class Memory : public device::Memory {

  void* persistent_host_ptr_;  //!< Host accessible pointer for persistent memory

+  // Map host memory allocation flags to the MemorySegment used for host allocation/locking
+  Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) {
+    const bool atomics = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
+    const bool uncached = (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0;
+    if (!atomics) return Device::MemorySegment::kNoAtomics;
+    return uncached ? Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics;
+  }
+
 private:
  // Disable copy constructor
  Memory(const Memory&);