SWDEV-543723 - Execute permission for kernArg buf (#728)
- Refactor deviceLocalAlloc arguments - Refactor hostAlloc code, have cleaner interface - Kern args buffer need to have execute flag set as CP enforces this on certain newer HW.
Šī revīzija ir iekļauta:
@@ -758,7 +758,9 @@ bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size, amd::Device*
|
||||
// callback thread.
|
||||
device_ = device;
|
||||
if (device->info().largeBar_) {
|
||||
graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size));
|
||||
amd::Device::AllocationFlags flags = {};
|
||||
flags.executable_ = true;
|
||||
graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size, flags));
|
||||
device_kernarg_pool_ = true;
|
||||
} else {
|
||||
graph_kernarg_base = reinterpret_cast<address>(
|
||||
|
||||
@@ -1814,13 +1814,27 @@ class Device : public RuntimeObject {
|
||||
/**
|
||||
* @copydoc amd::Context::hostAlloc
|
||||
*/
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const {
|
||||
virtual void* hostAlloc(size_t size, size_t alignment,
|
||||
MemorySegment mem_seg = kNoAtomics,
|
||||
const void* agentInfo = nullptr) const {
|
||||
ShouldNotCallThis();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
virtual void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
|
||||
bool contiguous = false) const {
|
||||
//! Flags for deviceLocalAlloc method
|
||||
typedef union {
|
||||
struct {
|
||||
uint32_t atomics_ : 1; //!< True if atomics support is required
|
||||
uint32_t pseudo_fine_grain_ : 1; //!< True if pseudo fine grain memory is required
|
||||
uint32_t contiguous_ : 1; //!< True if contiguous memory allocation is required
|
||||
uint32_t executable_ : 1; //!< True if executable memory is required
|
||||
uint32_t reserved_ : 28; //!< Reserved for future use
|
||||
};
|
||||
uint32_t data_;
|
||||
} AllocationFlags;
|
||||
|
||||
virtual void* deviceLocalAlloc(
|
||||
size_t size, const AllocationFlags& flags = AllocationFlags{}) const {
|
||||
ShouldNotCallThis();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -2420,7 +2420,8 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
|
||||
iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
|
||||
}
|
||||
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
|
||||
const void* agentInfo) const {
|
||||
// for discrete gpu, we only reserve,no commit yet.
|
||||
return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
|
||||
}
|
||||
|
||||
@@ -535,7 +535,8 @@ class Device : public NullDevice {
|
||||
) const;
|
||||
|
||||
//! host memory alloc
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics,
|
||||
const void* agentInfo = nullptr) const;
|
||||
|
||||
//! SVM allocation
|
||||
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
|
||||
|
||||
@@ -2006,10 +2006,15 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
|
||||
const AgentInfo* agentInfo) const {
|
||||
void* ptr = nullptr;
|
||||
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
|
||||
uint32_t memFlags = 0;
|
||||
if (mem_seg == kKernArg) {
|
||||
memFlags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
|
||||
}
|
||||
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, agentInfo);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, memFlags, &ptr);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
"Allocate hsa host memory %p, size 0x%zx,"
|
||||
" numa_node = %d, mem_seg = %d",
|
||||
@@ -2029,32 +2034,11 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
|
||||
void* ptr = nullptr;
|
||||
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
|
||||
if (stat != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Fail allocation host memory with err %d", stat);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
|
||||
if (stat != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat);
|
||||
hostFree(ptr, size);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
void* ptr = nullptr;
|
||||
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
|
||||
ptr = hostAlloc(size, alignment, mem_seg);
|
||||
ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
|
||||
#else
|
||||
int mode = MPOL_DEFAULT;
|
||||
int maxNodes = numa_num_possible_nodes();
|
||||
@@ -2077,14 +2061,14 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
|
||||
// We only care about the first CPU node
|
||||
for (unsigned int i = 0; i < cpuCount; i++) {
|
||||
if ((1u << i) & *nodeMask->maskp) {
|
||||
ptr = hostAgentAlloc(size, cpu_agents_[i], mem_seg);
|
||||
ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// All other modes fall back to default mode
|
||||
ptr = hostAlloc(size, alignment, mem_seg);
|
||||
ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
|
||||
}
|
||||
numa_free_cpumask(nodeMask);
|
||||
#endif // ROCCLR_SUPPORT_NUMA_POLICY
|
||||
@@ -2182,12 +2166,12 @@ void Device::releaseMemory(void* ptr, size_t size) const {
|
||||
}
|
||||
}
|
||||
|
||||
void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain,
|
||||
bool contiguous) const {
|
||||
void* Device::deviceLocalAlloc(size_t size, const AllocationFlags& flags) const {
|
||||
const hsa_amd_memory_pool_t& pool =
|
||||
(pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_
|
||||
: (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
|
||||
: gpuvm_segment_;
|
||||
(flags.pseudo_fine_grain_ && gpu_ext_fine_grained_segment_.handle)
|
||||
? gpu_ext_fine_grained_segment_
|
||||
: (flags.atomics_ && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
|
||||
: gpuvm_segment_;
|
||||
|
||||
if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
|
||||
DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle,
|
||||
@@ -2196,9 +2180,12 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain
|
||||
}
|
||||
|
||||
uint32_t hsa_mem_flags = 0;
|
||||
if (contiguous) {
|
||||
if (flags.contiguous_) {
|
||||
hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG;
|
||||
}
|
||||
if (flags.executable_) {
|
||||
hsa_mem_flags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
|
||||
}
|
||||
|
||||
void* ptr = nullptr;
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, hsa_mem_flags, &ptr);
|
||||
|
||||
@@ -412,7 +412,8 @@ class Device : public NullDevice {
|
||||
//! Gets free memory on a GPU device
|
||||
virtual bool globalFreeMemory(size_t* freeMemory) const;
|
||||
virtual void* hostAlloc(size_t size, size_t alignment,
|
||||
MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
|
||||
MemorySegment mem_seg = MemorySegment::kNoAtomics,
|
||||
const AgentInfo* agentInfo = nullptr) const; // nullptr uses default CPU agent
|
||||
virtual void hostFree(void* ptr, size_t size = 0) const;
|
||||
|
||||
bool deviceAllowAccess(void* dst) const;
|
||||
@@ -420,8 +421,9 @@ class Device : public NullDevice {
|
||||
bool allowPeerAccess(device::Memory* memory) const;
|
||||
void deviceVmemRelease(uint64_t mem_handle) const;
|
||||
uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
|
||||
void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
|
||||
bool contiguous = false) const;
|
||||
|
||||
void* deviceLocalAlloc(size_t size,
|
||||
const AllocationFlags& flags = AllocationFlags{}) const;
|
||||
void* reserveMemory(size_t size, size_t alignment) const;
|
||||
void releaseMemory(void* ptr, size_t size) const;
|
||||
void memFree(void* ptr, size_t size) const;
|
||||
@@ -463,9 +465,6 @@ class Device : public NullDevice {
|
||||
//! Allocate host memory in terms of numa policy set by user
|
||||
void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
|
||||
|
||||
//! Allocate host memory from agent info
|
||||
void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
|
||||
|
||||
//! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
|
||||
//! return a new device pointer accessible by the GPU agent.
|
||||
void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
|
||||
|
||||
@@ -854,9 +854,11 @@ bool Buffer::create(bool alloc_local) {
|
||||
}
|
||||
} else {
|
||||
assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
|
||||
deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0,
|
||||
(memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0,
|
||||
(memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0);
|
||||
amd::Device::AllocationFlags flags = {};
|
||||
flags.atomics_ = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
|
||||
flags.pseudo_fine_grain_ = (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0;
|
||||
flags.contiguous_ = (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0;
|
||||
deviceMemory_ = dev().deviceLocalAlloc(size(), flags);
|
||||
}
|
||||
owner()->setSvmPtr(deviceMemory_);
|
||||
} else {
|
||||
|
||||
@@ -1582,7 +1582,9 @@ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
|
||||
if (mem_segment == Device::MemorySegment::kKernArg &&
|
||||
(gpu_.dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
|
||||
gpu_.dev().info().largeBar_) {
|
||||
pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_));
|
||||
amd::Device::AllocationFlags flags = {};
|
||||
flags.executable_ = true;
|
||||
pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_, flags));
|
||||
if (pool_base_ != nullptr) {
|
||||
// @note Workaround first access penalty.
|
||||
// KFD may update CPU page tables on the first CPU access
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user