SWDEV-539145 - Simplify host memory pool management (#668)
* SWDEV-539145 - Simplify host memory pool management. Remove unnecessary variables and functions. Make the code simpler and clearer.
* Change cpu_agent_info_ into a pointer.
* Restore getPreferredNumaNode().
Tento commit je obsažen v:
@@ -191,11 +191,8 @@ Device::Device(hsa_agent_t bkendDevice)
|
||||
, numOfVgpus_(0)
|
||||
, preferred_numa_node_(0)
|
||||
, maxSdmaReadMask_(0)
|
||||
, maxSdmaWriteMask_(0) {
|
||||
, maxSdmaWriteMask_(0), cpu_agent_info_(nullptr) {
|
||||
group_segment_.handle = 0;
|
||||
system_segment_.handle = 0;
|
||||
system_coarse_segment_.handle = 0;
|
||||
system_kernarg_segment_.handle = 0;
|
||||
gpuvm_segment_.handle = 0;
|
||||
gpu_fine_grained_segment_.handle = 0;
|
||||
gpu_ext_fine_grained_segment_.handle = 0;
|
||||
@@ -225,20 +222,20 @@ void Device::setupCpuAgent() {
|
||||
}
|
||||
|
||||
preferred_numa_node_ = index;
|
||||
cpu_agent_ = cpu_agents_[index].agent;
|
||||
system_segment_ = cpu_agents_[index].fine_grain_pool;
|
||||
system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool;
|
||||
system_kernarg_segment_ = cpu_agents_[index].kern_arg_pool;
|
||||
system_ext_segment_ = cpu_agents_[index].ext_fine_grain_pool;
|
||||
cpu_agent_info_ = &cpu_agents_[index];
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx,"
|
||||
"coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index, cpu_agent_.handle,
|
||||
system_segment_.handle, system_coarse_segment_.handle, bkendDevice_.handle, isXgmi_);
|
||||
"coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index,
|
||||
cpu_agent_info_->agent.handle,
|
||||
cpu_agent_info_->fine_grain_pool.handle,
|
||||
cpu_agent_info_->coarse_grain_pool.handle,
|
||||
bkendDevice_.handle, isXgmi_);
|
||||
}
|
||||
|
||||
void Device::checkAtomicSupport() {
|
||||
std::vector<amd::Device::LinkAttrType> link_attrs;
|
||||
link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0));
|
||||
if (findLinkInfo(system_segment_, &link_attrs)) {
|
||||
if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) {
|
||||
if (link_attrs[0].second == 1) {
|
||||
info_.pcie_atomics_ = true;
|
||||
}
|
||||
@@ -863,7 +860,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
|
||||
// If cpu agent cannot access this pool, the device does not support large bar.
|
||||
hsa_amd_memory_pool_access_t tmp{};
|
||||
hsa_amd_agent_memory_pool_get_info(
|
||||
dev->cpu_agent_,
|
||||
dev->cpu_agent_info_->agent,
|
||||
pool,
|
||||
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
|
||||
&tmp);
|
||||
@@ -1166,7 +1163,7 @@ bool Device::populateOCLDeviceConstants() {
|
||||
|
||||
checkAtomicSupport();
|
||||
|
||||
assert(system_segment_.handle != 0);
|
||||
assert(cpu_agent_info_->fine_grain_pool.handle != 0);
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
|
||||
bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
|
||||
return false;
|
||||
@@ -1286,7 +1283,8 @@ bool Device::populateOCLDeviceConstants() {
|
||||
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_amd_memory_pool_get_info(
|
||||
system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) {
|
||||
cpu_agent_info_->fine_grain_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
|
||||
&alloc_granularity_)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -2005,46 +2003,54 @@ device::Memory* Device::createMemory(size_t size, size_t alignment) const {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
void* ptr = nullptr;
|
||||
|
||||
hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
|
||||
const AgentInfo* agentInfo) const {
|
||||
if (agentInfo == nullptr) {
|
||||
agentInfo = cpu_agent_info_;
|
||||
}
|
||||
hsa_amd_memory_pool_t segment{0};
|
||||
switch (mem_seg) {
|
||||
case kKernArg : {
|
||||
if (settings().fgs_kernel_arg_) {
|
||||
segment = system_kernarg_segment_;
|
||||
segment = agentInfo->kern_arg_pool;
|
||||
break;
|
||||
}
|
||||
// Falls through on else case.
|
||||
}
|
||||
case kNoAtomics :
|
||||
// If runtime disables barrier, then all host allocations must have L2 disabled
|
||||
if (system_coarse_segment_.handle != 0) {
|
||||
segment = system_coarse_segment_;
|
||||
if (agentInfo->coarse_grain_pool.handle != 0) {
|
||||
segment = agentInfo->coarse_grain_pool;
|
||||
break;
|
||||
}
|
||||
// Falls through on else case.
|
||||
case kAtomics :
|
||||
segment = system_segment_;
|
||||
segment = agentInfo->fine_grain_pool;
|
||||
break;
|
||||
case kUncachedAtomics :
|
||||
if (system_ext_segment_.handle != 0) {
|
||||
if (agentInfo->ext_fine_grain_pool.handle != 0) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
"Using extended fine grained access system memory pool");
|
||||
segment = system_ext_segment_;
|
||||
segment = agentInfo->ext_fine_grain_pool;
|
||||
} else {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
"Falling through on fine grained access system memory pool");
|
||||
segment = system_segment_;
|
||||
segment = agentInfo->fine_grain_pool;
|
||||
}
|
||||
break;
|
||||
default :
|
||||
guarantee(false, "Invalid Memory Segment");
|
||||
break;
|
||||
}
|
||||
|
||||
assert(segment.handle != 0);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
|
||||
return segment;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
void* ptr = nullptr;
|
||||
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx,"
|
||||
" numa_node = %d, mem_seg = %d", ptr, size, preferred_numa_node_, static_cast<int>(mem_seg));
|
||||
if (stat != HSA_STATUS_SUCCESS) {
|
||||
@@ -2065,28 +2071,8 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
|
||||
// ================================================================================================
|
||||
void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
|
||||
void* ptr = nullptr;
|
||||
hsa_amd_memory_pool_t segment = agentInfo.fine_grain_pool;
|
||||
switch (mem_seg) {
|
||||
case kNoAtomics :
|
||||
if (agentInfo.coarse_grain_pool.handle != 0) {
|
||||
segment = agentInfo.coarse_grain_pool;
|
||||
}
|
||||
break;
|
||||
case kUncachedAtomics :
|
||||
if (agentInfo.ext_fine_grain_pool.handle != 0) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
"Using extended fine grained access system memory pool in hostAgentAlloc");
|
||||
segment = agentInfo.ext_fine_grain_pool;
|
||||
} else {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
"Falling through on fine grained access system memory pool in hostAgentAlloc");
|
||||
}
|
||||
break;
|
||||
default :
|
||||
break;
|
||||
}
|
||||
assert(segment.handle != 0);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
|
||||
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
|
||||
if (stat != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Fail allocation host memory with err %d", stat);
|
||||
@@ -2144,6 +2130,21 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const {
|
||||
hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment);
|
||||
void *deviceMemory = nullptr;
|
||||
hsa_status_t status = hsa_amd_memory_lock_to_pool(hostMem, size,
|
||||
const_cast<hsa_agent_t*>(&bkendDevice_), 1, pool, 0, &deviceMemory);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, hostMem = %p,"
|
||||
" deviceMemory = %p, memSegment = %d", pool, size, hostMem, deviceMemory,
|
||||
static_cast<int>(memSegment));
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
|
||||
deviceMemory = nullptr;
|
||||
}
|
||||
return deviceMemory;
|
||||
}
|
||||
|
||||
// Thin wrapper: forwards the pointer and size unchanged to memFree().
void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }
|
||||
|
||||
bool Device::deviceAllowAccess(void* ptr) const {
|
||||
@@ -2585,11 +2586,11 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
|
||||
case amd::MemRangeAttribute::AccessedBy:
|
||||
accessed_by = attr.size();
|
||||
// Add all GPU devices into the query
|
||||
for (const auto agent : getGpuAgents()) {
|
||||
for (const auto agent : gpu_agents_) {
|
||||
attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle});
|
||||
}
|
||||
// Add CPU devices
|
||||
for (const auto agent_info : getCpuAgents()) {
|
||||
for (const auto agent_info : cpu_agents_) {
|
||||
attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle});
|
||||
}
|
||||
accessed_by = attr.size() - accessed_by;
|
||||
@@ -2643,7 +2644,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
|
||||
}
|
||||
}
|
||||
// Find CPU agent returned by ROCr
|
||||
for (auto& agent_info : getCpuAgents()) {
|
||||
for (auto& agent_info : cpu_agents_) {
|
||||
if (agent_info.agent.handle == it.value) {
|
||||
*reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::CpuDeviceId);
|
||||
}
|
||||
@@ -2678,7 +2679,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
|
||||
}
|
||||
}
|
||||
// Find CPU agent returned by ROCr
|
||||
for (auto& agent_info : getCpuAgents()) {
|
||||
for (auto& agent_info : cpu_agents_) {
|
||||
if (agent_info.agent.handle == it.value) {
|
||||
reinterpret_cast<int32_t*>(data[idx])[entry] =
|
||||
static_cast<int32_t>(amd::CpuDeviceId);
|
||||
|
||||
@@ -341,10 +341,8 @@ class Device : public NullDevice {
|
||||
static bool loadHsaModules();
|
||||
|
||||
hsa_agent_t getBackendDevice() const { return bkendDevice_; }
|
||||
const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU
|
||||
|
||||
static const std::vector<hsa_agent_t>& getGpuAgents() { return gpu_agents_; }
|
||||
static const std::vector<AgentInfo>& getCpuAgents() { return cpu_agents_; }
|
||||
//! Get the CPU agent with the least NUMA distance to this GPU
|
||||
const hsa_agent_t &getCpuAgent() const { return cpu_agent_info_->agent; }
|
||||
|
||||
void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU
|
||||
|
||||
@@ -408,7 +406,6 @@ class Device : public NullDevice {
|
||||
virtual bool globalFreeMemory(size_t* freeMemory) const;
|
||||
virtual void* hostAlloc(size_t size, size_t alignment,
|
||||
MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
|
||||
|
||||
virtual void hostFree(void* ptr, size_t size = 0) const;
|
||||
|
||||
bool deviceAllowAccess(void* dst) const;
|
||||
@@ -459,6 +456,10 @@ class Device : public NullDevice {
|
||||
//! Allocate host memory from agent info
|
||||
void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
|
||||
|
||||
//! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
|
||||
//! return a new device pointer accessible by the GPU agent.
|
||||
void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
|
||||
|
||||
//! Returns transfer engine object
|
||||
const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
|
||||
|
||||
@@ -501,10 +502,6 @@ class Device : public NullDevice {
|
||||
|
||||
VirtualGPU* xferQueue() const;
|
||||
|
||||
hsa_amd_memory_pool_t SystemSegment() const { return system_segment_; }
|
||||
hsa_amd_memory_pool_t SystemExtSegment() const { return system_ext_segment_; }
|
||||
hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
|
||||
|
||||
//! Acquire HSA queue. This method can create a new HSA queue or
|
||||
//! share previously created
|
||||
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
|
||||
@@ -547,6 +544,7 @@ class Device : public NullDevice {
|
||||
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0);
|
||||
|
||||
const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; }
|
||||
|
||||
const bool isFineGrainSupported() const;
|
||||
|
||||
//! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
|
||||
@@ -588,8 +586,6 @@ class Device : public NullDevice {
|
||||
static bool isHsaInitialized_;
|
||||
static std::vector<hsa_agent_t> gpu_agents_;
|
||||
static std::vector<AgentInfo> cpu_agents_;
|
||||
|
||||
hsa_agent_t cpu_agent_;
|
||||
uint32_t preferred_numa_node_;
|
||||
std::vector<hsa_agent_t> p2p_agents_; //!< List of P2P agents available for this device
|
||||
mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
|
||||
@@ -598,10 +594,8 @@ class Device : public NullDevice {
|
||||
hsa_agent_t* p2p_agents_list_ = nullptr;
|
||||
hsa_profile_t agent_profile_;
|
||||
hsa_amd_memory_pool_t group_segment_;
|
||||
hsa_amd_memory_pool_t system_segment_;
|
||||
hsa_amd_memory_pool_t system_coarse_segment_;
|
||||
hsa_amd_memory_pool_t system_kernarg_segment_;
|
||||
hsa_amd_memory_pool_t system_ext_segment_;
|
||||
|
||||
AgentInfo *cpu_agent_info_;
|
||||
|
||||
hsa_amd_memory_pool_t gpuvm_segment_;
|
||||
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
|
||||
@@ -649,7 +643,8 @@ class Device : public NullDevice {
|
||||
|
||||
//! Pool of HSA queues with custom CU masks
|
||||
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queueWithCUMaskPool_;
|
||||
|
||||
hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg,
|
||||
const AgentInfo* agentInfo = nullptr) const;
|
||||
//! Read and Write mask for device<->host
|
||||
uint32_t maxSdmaReadMask_;
|
||||
uint32_t maxSdmaWriteMask_;
|
||||
|
||||
@@ -825,10 +825,7 @@ bool Buffer::create(bool alloc_local) {
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
|
||||
}
|
||||
} else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
|
||||
deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
|
||||
? Device::MemorySegment::kNoAtomics :
|
||||
((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
|
||||
Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
|
||||
deviceMemory_ = dev().hostNumaAlloc(size(), 1, getHostMemorySegment(memFlags));
|
||||
} else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
|
||||
// TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal,
|
||||
// replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready.
|
||||
@@ -852,10 +849,7 @@ bool Buffer::create(bool alloc_local) {
|
||||
// Disable host access to force blit path for memory writes.
|
||||
flags_ &= ~HostMemoryDirectAccess;
|
||||
} else {
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
|
||||
? Device::MemorySegment::kNoAtomics :
|
||||
((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
|
||||
Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, getHostMemorySegment(memFlags));
|
||||
}
|
||||
} else {
|
||||
assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
|
||||
@@ -1012,28 +1006,8 @@ bool Buffer::create(bool alloc_local) {
|
||||
owner()->setHostMem(deviceMemory_);
|
||||
} else if (owner()->getSvmPtr() != owner()->getHostMem()) {
|
||||
if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
|
||||
hsa_amd_memory_pool_t pool = dev().SystemSegment(); // Default
|
||||
if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) {
|
||||
if (dev().SystemCoarseSegment().handle != 0) {
|
||||
pool = dev().SystemCoarseSegment();
|
||||
}
|
||||
} else if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) {
|
||||
if (dev().SystemExtSegment().handle != 0) {
|
||||
pool = dev().SystemExtSegment();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
"Using extended fine grained access system memory pool to lock");
|
||||
}
|
||||
}
|
||||
hsa_agent_t hsa_agent = dev().getBackendDevice();
|
||||
hsa_status_t status = hsa_amd_memory_lock_to_pool(owner()->getHostMem(),
|
||||
owner()->getSize(), &hsa_agent, 1, pool, 0, &deviceMemory_);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, HostPtr = %p,"
|
||||
" DevPtr = %p, memFlags = 0x%xh", pool, owner()->getSize(),
|
||||
owner()->getHostMem(), deviceMemory_, memFlags);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
|
||||
deviceMemory_ = nullptr;
|
||||
}
|
||||
deviceMemory_ = dev().hostLock(owner()->getHostMem(), owner()->getSize(),
|
||||
getHostMemorySegment(memFlags));
|
||||
} else {
|
||||
deviceMemory_ = owner()->getHostMem();
|
||||
}
|
||||
|
||||
@@ -151,6 +151,14 @@ class Memory : public device::Memory {
|
||||
|
||||
void* persistent_host_ptr_; //!< Host accessible pointer for persistent memory
|
||||
|
||||
// Get MemorySegment type in terms of host memory allocation flags
|
||||
Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) {
|
||||
return (memFlags & CL_MEM_SVM_ATOMICS) == 0
|
||||
? Device::MemorySegment::kNoAtomics :
|
||||
((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
|
||||
Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics);
|
||||
}
|
||||
|
||||
private:
|
||||
// Disable copy constructor
|
||||
Memory(const Memory&);
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele