From 1351cd7fa802efde2d4d2e5e6a904ba863c60532 Mon Sep 17 00:00:00 2001 From: "Sang, Tao" Date: Fri, 11 Jul 2025 10:38:40 -0400 Subject: [PATCH] SWDEV-539145 - Simplify host memory pool management (#668) * SWDEV-539145 - Simplify host memory pool management Remove unnecessary variables and functions. Make the code simpler and clearer. * Change cpu_agent_info_ into a pointer. * Restore getPreferredNumaNode() --- rocclr/device/rocm/rocdevice.cpp | 107 ++++++++++++++++--------------- rocclr/device/rocm/rocdevice.hpp | 27 ++++---- rocclr/device/rocm/rocmemory.cpp | 34 ++-------- rocclr/device/rocm/rocmemory.hpp | 8 +++ 4 files changed, 77 insertions(+), 99 deletions(-) diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 7ada04d7ea..02b2cc9e04 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -191,11 +191,8 @@ Device::Device(hsa_agent_t bkendDevice) , numOfVgpus_(0) , preferred_numa_node_(0) , maxSdmaReadMask_(0) - , maxSdmaWriteMask_(0) { + , maxSdmaWriteMask_(0), cpu_agent_info_(nullptr) { group_segment_.handle = 0; - system_segment_.handle = 0; - system_coarse_segment_.handle = 0; - system_kernarg_segment_.handle = 0; gpuvm_segment_.handle = 0; gpu_fine_grained_segment_.handle = 0; gpu_ext_fine_grained_segment_.handle = 0; @@ -225,20 +222,20 @@ void Device::setupCpuAgent() { } preferred_numa_node_ = index; - cpu_agent_ = cpu_agents_[index].agent; - system_segment_ = cpu_agents_[index].fine_grain_pool; - system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool; - system_kernarg_segment_ = cpu_agents_[index].kern_arg_pool; - system_ext_segment_ = cpu_agents_[index].ext_fine_grain_pool; + cpu_agent_info_ = &cpu_agents_[index]; + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx," - "coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index, cpu_agent_.handle, - system_segment_.handle, system_coarse_segment_.handle, bkendDevice_.handle, isXgmi_); + "coarse=0x%zx) for 
gpu agent=0x%zx CPU<->GPU XGMI=%d", index, + cpu_agent_info_->agent.handle, + cpu_agent_info_->fine_grain_pool.handle, + cpu_agent_info_->coarse_grain_pool.handle, + bkendDevice_.handle, isXgmi_); } void Device::checkAtomicSupport() { std::vector link_attrs; link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0)); - if (findLinkInfo(system_segment_, &link_attrs)) { + if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) { if (link_attrs[0].second == 1) { info_.pcie_atomics_ = true; } @@ -863,7 +860,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo // If cpu agent cannot access this pool, the device does not support large bar. hsa_amd_memory_pool_access_t tmp{}; hsa_amd_agent_memory_pool_get_info( - dev->cpu_agent_, + dev->cpu_agent_info_->agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &tmp); @@ -1166,7 +1163,7 @@ bool Device::populateOCLDeviceConstants() { checkAtomicSupport(); - assert(system_segment_.handle != 0); + assert(cpu_agent_info_->fine_grain_pool.handle != 0); if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools( bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) { return false; @@ -1286,7 +1283,8 @@ bool Device::populateOCLDeviceConstants() { if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info( - system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) { + cpu_agent_info_->fine_grain_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, + &alloc_granularity_)) { return false; } } @@ -2005,46 +2003,54 @@ device::Memory* Device::createMemory(size_t size, size_t alignment) const { } // ================================================================================================ -void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { - void* ptr = nullptr; - +hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg, + const AgentInfo* agentInfo) const { + if (agentInfo == nullptr) { + 
agentInfo = cpu_agent_info_; + } hsa_amd_memory_pool_t segment{0}; switch (mem_seg) { case kKernArg : { if (settings().fgs_kernel_arg_) { - segment = system_kernarg_segment_; + segment = agentInfo->kern_arg_pool; break; } // Falls through on else case. } case kNoAtomics : // If runtime disables barrier, then all host allocations must have L2 disabled - if (system_coarse_segment_.handle != 0) { - segment = system_coarse_segment_; + if (agentInfo->coarse_grain_pool.handle != 0) { + segment = agentInfo->coarse_grain_pool; break; } // Falls through on else case. case kAtomics : - segment = system_segment_; + segment = agentInfo->fine_grain_pool; break; case kUncachedAtomics : - if (system_ext_segment_.handle != 0) { + if (agentInfo->ext_fine_grain_pool.handle != 0) { ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Using extended fine grained access system memory pool"); - segment = system_ext_segment_; + segment = agentInfo->ext_fine_grain_pool; } else { ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Falling through on fine grained access system memory pool"); - segment = system_segment_; + segment = agentInfo->fine_grain_pool; } break; default : guarantee(false, "Invalid Memory Segment"); break; } - assert(segment.handle != 0); - hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr); + return segment; +} + +// ================================================================================================ +void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { + void* ptr = nullptr; + hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg); + hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx," " numa_node = %d, mem_seg = %d", ptr, size, preferred_numa_node_, static_cast(mem_seg)); if (stat != HSA_STATUS_SUCCESS) { @@ -2065,28 +2071,8 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co // 
================================================================================================ void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const { void* ptr = nullptr; - hsa_amd_memory_pool_t segment = agentInfo.fine_grain_pool; - switch (mem_seg) { - case kNoAtomics : - if (agentInfo.coarse_grain_pool.handle != 0) { - segment = agentInfo.coarse_grain_pool; - } - break; - case kUncachedAtomics : - if (agentInfo.ext_fine_grain_pool.handle != 0) { - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, - "Using extended fine grained access system memory pool in hostAgentAlloc"); - segment = agentInfo.ext_fine_grain_pool; - } else { - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, - "Falling through on fine grained access system memory pool in hostAgentAlloc"); - } - break; - default : - break; - } - assert(segment.handle != 0); - hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr); + hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo); + hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size); if (stat != HSA_STATUS_SUCCESS) { LogPrintfError("Fail allocation host memory with err %d", stat); @@ -2144,6 +2130,21 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg return ptr; } +void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const { + hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment); + void *deviceMemory = nullptr; + hsa_status_t status = hsa_amd_memory_lock_to_pool(hostMem, size, + const_cast(&bkendDevice_), 1, pool, 0, &deviceMemory); + ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, hostMem = %p," + " deviceMemory = %p, memSegment = %d", pool, size, hostMem, deviceMemory, + static_cast(memSegment)); + if (status != HSA_STATUS_SUCCESS) { + DevLogPrintfError("Failed to lock memory to pool, failed with 
hsa_status: %d \n", status); + deviceMemory = nullptr; + } + return deviceMemory; +} + void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); } bool Device::deviceAllowAccess(void* ptr) const { @@ -2585,11 +2586,11 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes, case amd::MemRangeAttribute::AccessedBy: accessed_by = attr.size(); // Add all GPU devices into the query - for (const auto agent : getGpuAgents()) { + for (const auto agent : gpu_agents_) { attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle}); } // Add CPU devices - for (const auto agent_info : getCpuAgents()) { + for (const auto agent_info : cpu_agents_) { attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle}); } accessed_by = attr.size() - accessed_by; @@ -2643,7 +2644,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes, } } // Find CPU agent returned by ROCr - for (auto& agent_info : getCpuAgents()) { + for (auto& agent_info : cpu_agents_) { if (agent_info.agent.handle == it.value) { *reinterpret_cast(data[idx]) = static_cast(amd::CpuDeviceId); } @@ -2678,7 +2679,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes, } } // Find CPU agent returned by ROCr - for (auto& agent_info : getCpuAgents()) { + for (auto& agent_info : cpu_agents_) { if (agent_info.agent.handle == it.value) { reinterpret_cast(data[idx])[entry] = static_cast(amd::CpuDeviceId); diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 140431ec1b..85b10edb6e 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -341,10 +341,8 @@ class Device : public NullDevice { static bool loadHsaModules(); hsa_agent_t getBackendDevice() const { return bkendDevice_; } - const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU - - static const std::vector& getGpuAgents() { 
return gpu_agents_; } - static const std::vector& getCpuAgents() { return cpu_agents_; } + //! Get the CPU agent with the least NUMA distance to this GPU + const hsa_agent_t &getCpuAgent() const { return cpu_agent_info_->agent; } void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU @@ -408,7 +406,6 @@ class Device : public NullDevice { virtual bool globalFreeMemory(size_t* freeMemory) const; virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = MemorySegment::kNoAtomics) const; - virtual void hostFree(void* ptr, size_t size = 0) const; bool deviceAllowAccess(void* dst) const; @@ -459,6 +456,10 @@ class Device : public NullDevice { //! Allocate host memory from agent info void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const; + //! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + //! return a new device pointer accessible by the GPU agent. + void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const; + //! Returns transfer engine object const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); } @@ -501,10 +502,6 @@ class Device : public NullDevice { VirtualGPU* xferQueue() const; - hsa_amd_memory_pool_t SystemSegment() const { return system_segment_; } - hsa_amd_memory_pool_t SystemExtSegment() const { return system_ext_segment_; } - hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; } - //! Acquire HSA queue. This method can create a new HSA queue or //! share previously created hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false, @@ -547,6 +544,7 @@ class Device : public NullDevice { virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0); const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; } + const bool isFineGrainSupported() const; //! 
Returns True if memory pointer is known to ROCr (excludes HMM allocations) @@ -588,8 +586,6 @@ class Device : public NullDevice { static bool isHsaInitialized_; static std::vector gpu_agents_; static std::vector cpu_agents_; - - hsa_agent_t cpu_agent_; uint32_t preferred_numa_node_; std::vector p2p_agents_; //!< List of P2P agents available for this device mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls @@ -598,10 +594,8 @@ class Device : public NullDevice { hsa_agent_t* p2p_agents_list_ = nullptr; hsa_profile_t agent_profile_; hsa_amd_memory_pool_t group_segment_; - hsa_amd_memory_pool_t system_segment_; - hsa_amd_memory_pool_t system_coarse_segment_; - hsa_amd_memory_pool_t system_kernarg_segment_; - hsa_amd_memory_pool_t system_ext_segment_; + + AgentInfo *cpu_agent_info_; hsa_amd_memory_pool_t gpuvm_segment_; hsa_amd_memory_pool_t gpu_fine_grained_segment_; @@ -649,7 +643,8 @@ class Device : public NullDevice { //! Pool of HSA queues with custom CU masks std::vector> queueWithCUMaskPool_; - + hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg, + const AgentInfo* agentInfo = nullptr) const; //! Read and Write mask for device<->host uint32_t maxSdmaReadMask_; uint32_t maxSdmaWriteMask_; diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp index 50a981ec5e..6308e28e4e 100644 --- a/rocclr/device/rocm/rocmemory.cpp +++ b/rocclr/device/rocm/rocmemory.cpp @@ -825,10 +825,7 @@ bool Buffer::create(bool alloc_local) { deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); } } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) { - deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0 - ? Device::MemorySegment::kNoAtomics : - ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ? 
- Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics)); + deviceMemory_ = dev().hostNumaAlloc(size(), 1, getHostMemorySegment(memFlags)); } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) { // TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal, // replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready. @@ -852,10 +849,7 @@ bool Buffer::create(bool alloc_local) { // Disable host access to force blit path for memeory writes. flags_ &= ~HostMemoryDirectAccess; } else { - deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0 - ? Device::MemorySegment::kNoAtomics : - ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ? - Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics)); + deviceMemory_ = dev().hostAlloc(size(), 1, getHostMemorySegment(memFlags)); } } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); @@ -1012,28 +1006,8 @@ bool Buffer::create(bool alloc_local) { owner()->setHostMem(deviceMemory_); } else if (owner()->getSvmPtr() != owner()->getHostMem()) { if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { - hsa_amd_memory_pool_t pool = dev().SystemSegment(); // Default - if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) { - if (dev().SystemCoarseSegment().handle != 0) { - pool = dev().SystemCoarseSegment(); - } - } else if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) { - if (dev().SystemExtSegment().handle != 0) { - pool = dev().SystemExtSegment(); - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, - "Using extended fine grained access system memory pool to lock"); - } - } - hsa_agent_t hsa_agent = dev().getBackendDevice(); - hsa_status_t status = hsa_amd_memory_lock_to_pool(owner()->getHostMem(), - owner()->getSize(), &hsa_agent, 1, pool, 0, &deviceMemory_); - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, HostPtr = %p," - " DevPtr = %p, memFlags = 0x%xh", pool, owner()->getSize(), - 
owner()->getHostMem(), deviceMemory_, memFlags); - if (status != HSA_STATUS_SUCCESS) { - DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status); - deviceMemory_ = nullptr; - } + deviceMemory_ = dev().hostLock(owner()->getHostMem(), owner()->getSize(), + getHostMemorySegment(memFlags)); } else { deviceMemory_ = owner()->getHostMem(); } diff --git a/rocclr/device/rocm/rocmemory.hpp b/rocclr/device/rocm/rocmemory.hpp index 262be44457..746ee4fc00 100644 --- a/rocclr/device/rocm/rocmemory.hpp +++ b/rocclr/device/rocm/rocmemory.hpp @@ -151,6 +151,14 @@ class Memory : public device::Memory { void* persistent_host_ptr_; //!< Host accessible pointer for persistent memory + // Get MemorySegment type in terms of host memory allocation flags + Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) { + return (memFlags & CL_MEM_SVM_ATOMICS) == 0 + ? Device::MemorySegment::kNoAtomics : + ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ? + Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics); + } + private: // Disable copy constructor Memory(const Memory&);