SWDEV-539145 - Simplify host memory pool management (#668)

* SWDEV-539145 - Simplify host memory pool management

Remove unnecessary variables and functions.
Make the code simpler and clearer.

* Change cpu_agent_info_ into a pointer.

* Restore getPreferredNumaNode()
Tento commit je obsažen v:
Sang, Tao
2025-07-11 10:38:40 -04:00
odevzdal GitHub
rodič b568971718
revize 1351cd7fa8
4 změnil soubory, kde provedl 77 přidání a 99 odebrání
+54 -53
Zobrazit soubor
@@ -191,11 +191,8 @@ Device::Device(hsa_agent_t bkendDevice)
, numOfVgpus_(0)
, preferred_numa_node_(0)
, maxSdmaReadMask_(0)
, maxSdmaWriteMask_(0) {
, maxSdmaWriteMask_(0), cpu_agent_info_(nullptr) {
group_segment_.handle = 0;
system_segment_.handle = 0;
system_coarse_segment_.handle = 0;
system_kernarg_segment_.handle = 0;
gpuvm_segment_.handle = 0;
gpu_fine_grained_segment_.handle = 0;
gpu_ext_fine_grained_segment_.handle = 0;
@@ -225,20 +222,20 @@ void Device::setupCpuAgent() {
}
preferred_numa_node_ = index;
cpu_agent_ = cpu_agents_[index].agent;
system_segment_ = cpu_agents_[index].fine_grain_pool;
system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool;
system_kernarg_segment_ = cpu_agents_[index].kern_arg_pool;
system_ext_segment_ = cpu_agents_[index].ext_fine_grain_pool;
cpu_agent_info_ = &cpu_agents_[index];
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx,"
"coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index, cpu_agent_.handle,
system_segment_.handle, system_coarse_segment_.handle, bkendDevice_.handle, isXgmi_);
"coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index,
cpu_agent_info_->agent.handle,
cpu_agent_info_->fine_grain_pool.handle,
cpu_agent_info_->coarse_grain_pool.handle,
bkendDevice_.handle, isXgmi_);
}
void Device::checkAtomicSupport() {
std::vector<amd::Device::LinkAttrType> link_attrs;
link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0));
if (findLinkInfo(system_segment_, &link_attrs)) {
if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) {
if (link_attrs[0].second == 1) {
info_.pcie_atomics_ = true;
}
@@ -863,7 +860,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
// If cpu agent cannot access this pool, the device does not support large bar.
hsa_amd_memory_pool_access_t tmp{};
hsa_amd_agent_memory_pool_get_info(
dev->cpu_agent_,
dev->cpu_agent_info_->agent,
pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&tmp);
@@ -1166,7 +1163,7 @@ bool Device::populateOCLDeviceConstants() {
checkAtomicSupport();
assert(system_segment_.handle != 0);
assert(cpu_agent_info_->fine_grain_pool.handle != 0);
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
return false;
@@ -1286,7 +1283,8 @@ bool Device::populateOCLDeviceConstants() {
if (HSA_STATUS_SUCCESS !=
hsa_amd_memory_pool_get_info(
system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) {
cpu_agent_info_->fine_grain_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
&alloc_granularity_)) {
return false;
}
}
@@ -2005,46 +2003,54 @@ device::Memory* Device::createMemory(size_t size, size_t alignment) const {
}
// ================================================================================================
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* ptr = nullptr;
hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
const AgentInfo* agentInfo) const {
if (agentInfo == nullptr) {
agentInfo = cpu_agent_info_;
}
hsa_amd_memory_pool_t segment{0};
switch (mem_seg) {
case kKernArg : {
if (settings().fgs_kernel_arg_) {
segment = system_kernarg_segment_;
segment = agentInfo->kern_arg_pool;
break;
}
// Falls through on else case.
}
case kNoAtomics :
// If runtime disables barrier, then all host allocations must have L2 disabled
if (system_coarse_segment_.handle != 0) {
segment = system_coarse_segment_;
if (agentInfo->coarse_grain_pool.handle != 0) {
segment = agentInfo->coarse_grain_pool;
break;
}
// Falls through on else case.
case kAtomics :
segment = system_segment_;
segment = agentInfo->fine_grain_pool;
break;
case kUncachedAtomics :
if (system_ext_segment_.handle != 0) {
if (agentInfo->ext_fine_grain_pool.handle != 0) {
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
"Using extended fine grained access system memory pool");
segment = system_ext_segment_;
segment = agentInfo->ext_fine_grain_pool;
} else {
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
"Falling through on fine grained access system memory pool");
segment = system_segment_;
segment = agentInfo->fine_grain_pool;
}
break;
default :
guarantee(false, "Invalid Memory Segment");
break;
}
assert(segment.handle != 0);
hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
return segment;
}
// ================================================================================================
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* ptr = nullptr;
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx,"
" numa_node = %d, mem_seg = %d", ptr, size, preferred_numa_node_, static_cast<int>(mem_seg));
if (stat != HSA_STATUS_SUCCESS) {
@@ -2065,28 +2071,8 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
// ================================================================================================
void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
void* ptr = nullptr;
hsa_amd_memory_pool_t segment = agentInfo.fine_grain_pool;
switch (mem_seg) {
case kNoAtomics :
if (agentInfo.coarse_grain_pool.handle != 0) {
segment = agentInfo.coarse_grain_pool;
}
break;
case kUncachedAtomics :
if (agentInfo.ext_fine_grain_pool.handle != 0) {
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
"Using extended fine grained access system memory pool in hostAgentAlloc");
segment = agentInfo.ext_fine_grain_pool;
} else {
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
"Falling through on fine grained access system memory pool in hostAgentAlloc");
}
break;
default :
break;
}
assert(segment.handle != 0);
hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
if (stat != HSA_STATUS_SUCCESS) {
LogPrintfError("Fail allocation host memory with err %d", stat);
@@ -2144,6 +2130,21 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
return ptr;
}
void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const {
hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment);
void *deviceMemory = nullptr;
hsa_status_t status = hsa_amd_memory_lock_to_pool(hostMem, size,
const_cast<hsa_agent_t*>(&bkendDevice_), 1, pool, 0, &deviceMemory);
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, hostMem = %p,"
" deviceMemory = %p, memSegment = %d", pool, size, hostMem, deviceMemory,
static_cast<int>(memSegment));
if (status != HSA_STATUS_SUCCESS) {
DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
deviceMemory = nullptr;
}
return deviceMemory;
}
//! Releases host memory by forwarding the pointer and size unchanged to memFree().
void Device::hostFree(void* ptr, size_t size) const {
  memFree(ptr, size);
}
bool Device::deviceAllowAccess(void* ptr) const {
@@ -2585,11 +2586,11 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
case amd::MemRangeAttribute::AccessedBy:
accessed_by = attr.size();
// Add all GPU devices into the query
for (const auto agent : getGpuAgents()) {
for (const auto agent : gpu_agents_) {
attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle});
}
// Add CPU devices
for (const auto agent_info : getCpuAgents()) {
for (const auto agent_info : cpu_agents_) {
attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle});
}
accessed_by = attr.size() - accessed_by;
@@ -2643,7 +2644,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
}
}
// Find CPU agent returned by ROCr
for (auto& agent_info : getCpuAgents()) {
for (auto& agent_info : cpu_agents_) {
if (agent_info.agent.handle == it.value) {
*reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::CpuDeviceId);
}
@@ -2678,7 +2679,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
}
}
// Find CPU agent returned by ROCr
for (auto& agent_info : getCpuAgents()) {
for (auto& agent_info : cpu_agents_) {
if (agent_info.agent.handle == it.value) {
reinterpret_cast<int32_t*>(data[idx])[entry] =
static_cast<int32_t>(amd::CpuDeviceId);
+11 -16
Zobrazit soubor
@@ -341,10 +341,8 @@ class Device : public NullDevice {
static bool loadHsaModules();
hsa_agent_t getBackendDevice() const { return bkendDevice_; }
const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU
static const std::vector<hsa_agent_t>& getGpuAgents() { return gpu_agents_; }
static const std::vector<AgentInfo>& getCpuAgents() { return cpu_agents_; }
//! Get the CPU agent with the least NUMA distance to this GPU
const hsa_agent_t &getCpuAgent() const { return cpu_agent_info_->agent; }
void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU
@@ -408,7 +406,6 @@ class Device : public NullDevice {
virtual bool globalFreeMemory(size_t* freeMemory) const;
virtual void* hostAlloc(size_t size, size_t alignment,
MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
virtual void hostFree(void* ptr, size_t size = 0) const;
bool deviceAllowAccess(void* dst) const;
@@ -459,6 +456,10 @@ class Device : public NullDevice {
//! Allocate host memory from agent info
void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
//! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
//! return a new device pointer accessible by the GPU agent.
void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
//! Returns transfer engine object
const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
@@ -501,10 +502,6 @@ class Device : public NullDevice {
VirtualGPU* xferQueue() const;
hsa_amd_memory_pool_t SystemSegment() const { return system_segment_; }
hsa_amd_memory_pool_t SystemExtSegment() const { return system_ext_segment_; }
hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
//! Acquire HSA queue. This method can create a new HSA queue or
//! share previously created
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
@@ -547,6 +544,7 @@ class Device : public NullDevice {
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0);
const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; }
const bool isFineGrainSupported() const;
//! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
@@ -588,8 +586,6 @@ class Device : public NullDevice {
static bool isHsaInitialized_;
static std::vector<hsa_agent_t> gpu_agents_;
static std::vector<AgentInfo> cpu_agents_;
hsa_agent_t cpu_agent_;
uint32_t preferred_numa_node_;
std::vector<hsa_agent_t> p2p_agents_; //!< List of P2P agents available for this device
mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
@@ -598,10 +594,8 @@ class Device : public NullDevice {
hsa_agent_t* p2p_agents_list_ = nullptr;
hsa_profile_t agent_profile_;
hsa_amd_memory_pool_t group_segment_;
hsa_amd_memory_pool_t system_segment_;
hsa_amd_memory_pool_t system_coarse_segment_;
hsa_amd_memory_pool_t system_kernarg_segment_;
hsa_amd_memory_pool_t system_ext_segment_;
AgentInfo *cpu_agent_info_;
hsa_amd_memory_pool_t gpuvm_segment_;
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
@@ -649,7 +643,8 @@ class Device : public NullDevice {
//! Pool of HSA queues with custom CU masks
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queueWithCUMaskPool_;
hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg,
const AgentInfo* agentInfo = nullptr) const;
//! Read and Write mask for device<->host
uint32_t maxSdmaReadMask_;
uint32_t maxSdmaWriteMask_;
+4 -30
Zobrazit soubor
@@ -825,10 +825,7 @@ bool Buffer::create(bool alloc_local) {
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
}
} else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
? Device::MemorySegment::kNoAtomics :
((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
deviceMemory_ = dev().hostNumaAlloc(size(), 1, getHostMemorySegment(memFlags));
} else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
// TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal,
// replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready.
@@ -852,10 +849,7 @@ bool Buffer::create(bool alloc_local) {
// Disable host access to force blit path for memory writes.
flags_ &= ~HostMemoryDirectAccess;
} else {
deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
? Device::MemorySegment::kNoAtomics :
((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
deviceMemory_ = dev().hostAlloc(size(), 1, getHostMemorySegment(memFlags));
}
} else {
assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
@@ -1012,28 +1006,8 @@ bool Buffer::create(bool alloc_local) {
owner()->setHostMem(deviceMemory_);
} else if (owner()->getSvmPtr() != owner()->getHostMem()) {
if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
hsa_amd_memory_pool_t pool = dev().SystemSegment(); // Default
if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) {
if (dev().SystemCoarseSegment().handle != 0) {
pool = dev().SystemCoarseSegment();
}
} else if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) {
if (dev().SystemExtSegment().handle != 0) {
pool = dev().SystemExtSegment();
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
"Using extended fine grained access system memory pool to lock");
}
}
hsa_agent_t hsa_agent = dev().getBackendDevice();
hsa_status_t status = hsa_amd_memory_lock_to_pool(owner()->getHostMem(),
owner()->getSize(), &hsa_agent, 1, pool, 0, &deviceMemory_);
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, HostPtr = %p,"
" DevPtr = %p, memFlags = 0x%xh", pool, owner()->getSize(),
owner()->getHostMem(), deviceMemory_, memFlags);
if (status != HSA_STATUS_SUCCESS) {
DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
deviceMemory_ = nullptr;
}
deviceMemory_ = dev().hostLock(owner()->getHostMem(), owner()->getSize(),
getHostMemorySegment(memFlags));
} else {
deviceMemory_ = owner()->getHostMem();
}
+8
Zobrazit soubor
@@ -151,6 +151,14 @@ class Memory : public device::Memory {
void* persistent_host_ptr_; //!< Host accessible pointer for persistent memory
//! Translates host memory allocation flags into the matching Device::MemorySegment.
//!   - CL_MEM_SVM_ATOMICS not set                      -> kNoAtomics
//!   - CL_MEM_SVM_ATOMICS and ROCCLR_MEM_HSA_UNCACHED  -> kUncachedAtomics
//!   - CL_MEM_SVM_ATOMICS only                         -> kAtomics
Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) {
  if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) {
    return Device::MemorySegment::kNoAtomics;
  }
  if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) {
    return Device::MemorySegment::kUncachedAtomics;
  }
  return Device::MemorySegment::kAtomics;
}
private:
// Disable copy constructor
Memory(const Memory&);