SWDEV-543723 - Execute permission for kernArg buf (#728)

- Refactor deviceLocalAlloc arguments
- Refactor hostAlloc code to provide a cleaner interface
- The kernarg buffer needs to have the execute flag set, as CP enforces this on
  certain newer HW.
Šī revīzija ir iekļauta:
SaleelK
2025-09-08 12:21:30 -07:00
revīziju iesūtīja GitHub
vecāks dcc526e1fd
revīzija e197aa83ba
8 mainīti faili ar 57 papildinājumiem un 49 dzēšanām
@@ -758,7 +758,9 @@ bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size, amd::Device*
// callback thread.
device_ = device;
if (device->info().largeBar_) {
graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size));
amd::Device::AllocationFlags flags = {};
flags.executable_ = true;
graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size, flags));
device_kernarg_pool_ = true;
} else {
graph_kernarg_base = reinterpret_cast<address>(
+17 -3
Parādīt failu
@@ -1814,13 +1814,27 @@ class Device : public RuntimeObject {
/**
* @copydoc amd::Context::hostAlloc
*
* Base-class stub: the body only calls ShouldNotCallThis() and returns NULL.
* The trailing agentInfo parameter is typed `const void*` and defaults to nullptr.
*
* NOTE(review): the hostAlloc declaration and definition further down in this
* change type the same parameter as `const AgentInfo*`. If those are meant to
* override this virtual, the parameter types have to match exactly; with a
* different pointer type they would declare a separate function that hides this
* one -- confirm the intended type.
*/
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const {
virtual void* hostAlloc(size_t size, size_t alignment,
MemorySegment mem_seg = kNoAtomics,
const void* agentInfo = nullptr) const {
ShouldNotCallThis();
return NULL;
}
virtual void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
bool contiguous = false) const {
//! Option bits accepted by deviceLocalAlloc.
//! The one-bit named fields and the raw 32-bit word `data_` occupy the same storage.
union AllocationFlags {
  struct {
    uint32_t atomics_ : 1;            //!< Set when atomics support is required
    uint32_t pseudo_fine_grain_ : 1;  //!< Set when pseudo fine grain memory is required
    uint32_t contiguous_ : 1;         //!< Set when a contiguous allocation is required
    uint32_t executable_ : 1;         //!< Set when executable memory is required
    uint32_t reserved_ : 28;          //!< Unused bits kept for future flags
  };
  uint32_t data_;  //!< Every flag bit viewed as one 32-bit value
};
//! Device-local allocation entry point. This version is a stub: it calls
//! ShouldNotCallThis() and returns a null pointer without reading its arguments.
//! @param size   requested allocation size
//! @param flags  allocation options; a value-initialized AllocationFlags when omitted
//! @return always a null pointer in this stub
virtual void* deviceLocalAlloc(size_t size,
                               const AllocationFlags& flags = AllocationFlags{}) const {
  ShouldNotCallThis();
  return nullptr;
}
@@ -2420,7 +2420,8 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
}
// Host allocation for this device: forwards size and alignment to
// amd::Os::reserveMemory with a null start address and MEM_PROT_NONE
// protection, and returns whatever that call returns.
// Neither mem_seg nor agentInfo is read in this body.
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
const void* agentInfo) const {
// For a discrete GPU the memory is only reserved here; nothing is committed yet.
return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
}
@@ -535,7 +535,8 @@ class Device : public NullDevice {
) const;
//! host memory alloc
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics,
const void* agentInfo = nullptr) const;
//! SVM allocation
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
@@ -2006,10 +2006,15 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
}
// ================================================================================================
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
const AgentInfo* agentInfo) const {
void* ptr = nullptr;
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
uint32_t memFlags = 0;
if (mem_seg == kKernArg) {
memFlags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
}
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, agentInfo);
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, memFlags, &ptr);
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
"Allocate hsa host memory %p, size 0x%zx,"
" numa_node = %d, mem_seg = %d",
@@ -2029,32 +2034,11 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
return ptr;
}
// ================================================================================================
// Allocates `size` bytes from the host memory pool that getHostMemoryPool()
// returns for (mem_seg, &agentInfo), then calls hsa_amd_agents_allow_access()
// for the agents stored in gpu_agents_.
// Returns the allocated pointer, or nullptr when either HSA call reports a
// status other than HSA_STATUS_SUCCESS.
void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
void* ptr = nullptr;
hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
// The flags argument is always 0 here, regardless of mem_seg.
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
// Logged before the status check, so ptr is printed even when allocation failed.
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
if (stat != HSA_STATUS_SUCCESS) {
LogPrintfError("Fail allocation host memory with err %d", stat);
return nullptr;
}
// NOTE(review): &gpu_agents_[0] presumably relies on gpu_agents_ being non-empty -- confirm.
stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
if (stat != HSA_STATUS_SUCCESS) {
LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat);
// Release the allocation so a failed access grant does not leave it behind.
hostFree(ptr, size);
return nullptr;
}
return ptr;
}
// ================================================================================================
void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* ptr = nullptr;
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
ptr = hostAlloc(size, alignment, mem_seg);
ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
#else
int mode = MPOL_DEFAULT;
int maxNodes = numa_num_possible_nodes();
@@ -2077,14 +2061,14 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
// We only care about the first CPU node
for (unsigned int i = 0; i < cpuCount; i++) {
if ((1u << i) & *nodeMask->maskp) {
ptr = hostAgentAlloc(size, cpu_agents_[i], mem_seg);
ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]);
break;
}
}
break;
default:
// All other modes fall back to default mode
ptr = hostAlloc(size, alignment, mem_seg);
ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
}
numa_free_cpumask(nodeMask);
#endif // ROCCLR_SUPPORT_NUMA_POLICY
@@ -2182,12 +2166,12 @@ void Device::releaseMemory(void* ptr, size_t size) const {
}
}
void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain,
bool contiguous) const {
void* Device::deviceLocalAlloc(size_t size, const AllocationFlags& flags) const {
const hsa_amd_memory_pool_t& pool =
(pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_
: (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
: gpuvm_segment_;
(flags.pseudo_fine_grain_ && gpu_ext_fine_grained_segment_.handle)
? gpu_ext_fine_grained_segment_
: (flags.atomics_ && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
: gpuvm_segment_;
if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle,
@@ -2196,9 +2180,12 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain
}
uint32_t hsa_mem_flags = 0;
if (contiguous) {
if (flags.contiguous_) {
hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG;
}
if (flags.executable_) {
hsa_mem_flags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
}
void* ptr = nullptr;
hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, hsa_mem_flags, &ptr);
@@ -412,7 +412,8 @@ class Device : public NullDevice {
//! Gets free memory on a GPU device
virtual bool globalFreeMemory(size_t* freeMemory) const;
//! Host memory allocation. The definition in this change forwards agentInfo to
//! getHostMemoryPool() together with mem_seg.
//! NOTE(review): the amd::Device declaration of hostAlloc shown earlier in this
//! change types the last parameter as `const void*`. If this class derives from
//! it, a `const AgentInfo*` parameter here does not override that virtual; it
//! declares a new function that hides it -- confirm the intended type.
virtual void* hostAlloc(size_t size, size_t alignment,
MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
MemorySegment mem_seg = MemorySegment::kNoAtomics,
const AgentInfo* agentInfo = nullptr) const; // nullptr uses default CPU agent
virtual void hostFree(void* ptr, size_t size = 0) const;
bool deviceAllowAccess(void* dst) const;
@@ -420,8 +421,9 @@ class Device : public NullDevice {
bool allowPeerAccess(device::Memory* memory) const;
void deviceVmemRelease(uint64_t mem_handle) const;
uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
bool contiguous = false) const;
void* deviceLocalAlloc(size_t size,
const AllocationFlags& flags = AllocationFlags{}) const;
void* reserveMemory(size_t size, size_t alignment) const;
void releaseMemory(void* ptr, size_t size) const;
void memFree(void* ptr, size_t size) const;
@@ -463,9 +465,6 @@ class Device : public NullDevice {
//! Allocate host memory in terms of numa policy set by user
void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
//! Allocate host memory from agent info
void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
//! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
//! return a new device pointer accessible by the GPU agent.
void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
@@ -854,9 +854,11 @@ bool Buffer::create(bool alloc_local) {
}
} else {
assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0,
(memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0,
(memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0);
amd::Device::AllocationFlags flags = {};
flags.atomics_ = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
flags.pseudo_fine_grain_ = (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0;
flags.contiguous_ = (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0;
deviceMemory_ = dev().deviceLocalAlloc(size(), flags);
}
owner()->setSvmPtr(deviceMemory_);
} else {
@@ -1582,7 +1582,9 @@ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
if (mem_segment == Device::MemorySegment::kKernArg &&
(gpu_.dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
gpu_.dev().info().largeBar_) {
pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_));
amd::Device::AllocationFlags flags = {};
flags.executable_ = true;
pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_, flags));
if (pool_base_ != nullptr) {
// @note Workaround first access penalty.
// KFD may update CPU page tables on the first CPU access