From e197aa83ba3310ee2cb72a5a626676c73ea86c5a Mon Sep 17 00:00:00 2001 From: SaleelK Date: Mon, 8 Sep 2025 12:21:30 -0700 Subject: [PATCH] SWDEV-543723 - Execute permission for kernArg buf (#728) - Refactor deviceLocalAlloc arguments - Refactor hostAlloc code, have cleaner interface - Kern args buffer need to have execute flag set as CP enforces this on certain newer HW. --- .../clr/hipamd/src/hip_graph_internal.cpp | 4 +- projects/clr/rocclr/device/device.hpp | 20 +++++-- projects/clr/rocclr/device/pal/paldevice.cpp | 3 +- projects/clr/rocclr/device/pal/paldevice.hpp | 3 +- projects/clr/rocclr/device/rocm/rocdevice.cpp | 53 +++++++------------ projects/clr/rocclr/device/rocm/rocdevice.hpp | 11 ++-- projects/clr/rocclr/device/rocm/rocmemory.cpp | 8 +-- .../clr/rocclr/device/rocm/rocvirtual.cpp | 4 +- 8 files changed, 57 insertions(+), 49 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 4a7e735b00..e55f41c645 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -758,7 +758,9 @@ bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size, amd::Device* // callback thread. device_ = device; if (device->info().largeBar_) { - graph_kernarg_base = reinterpret_cast
(device->deviceLocalAlloc(pool_size)); + amd::Device::AllocationFlags flags = {}; + flags.executable_ = true; + graph_kernarg_base = reinterpret_cast
(device->deviceLocalAlloc(pool_size, flags)); device_kernarg_pool_ = true; } else { graph_kernarg_base = reinterpret_cast
( diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 1b7bcd41d2..84558efcba 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1814,13 +1814,27 @@ class Device : public RuntimeObject { /** * @copydoc amd::Context::hostAlloc */ - virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const { + virtual void* hostAlloc(size_t size, size_t alignment, + MemorySegment mem_seg = kNoAtomics, + const void* agentInfo = nullptr) const { ShouldNotCallThis(); return NULL; } - virtual void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false, - bool contiguous = false) const { + //! Flags for deviceLocalAlloc method + typedef union { + struct { + uint32_t atomics_ : 1; //!< True if atomics support is required + uint32_t pseudo_fine_grain_ : 1; //!< True if pseudo fine grain memory is required + uint32_t contiguous_ : 1; //!< True if contiguous memory allocation is required + uint32_t executable_ : 1; //!< True if executable memory is required + uint32_t reserved_ : 28; //!< Reserved for future use + }; + uint32_t data_; + } AllocationFlags; + + virtual void* deviceLocalAlloc( + size_t size, const AllocationFlags& flags = AllocationFlags{}) const { ShouldNotCallThis(); return NULL; } diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index dcf58d531c..0003cf04ea 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -2420,7 +2420,8 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize, iDev()->CreateSamplerSrds(1, &samplerInfo, hwState); } -void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { +void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg, + const void* agentInfo) const { // for discrete gpu, we only reserve,no commit yet. return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE); } diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp index 7cca51e244..bff5fe15a7 100644 --- a/projects/clr/rocclr/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/device/pal/paldevice.hpp @@ -535,7 +535,8 @@ class Device : public NullDevice { ) const; //! host memory alloc - virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const; + virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics, + const void* agentInfo = nullptr) const; //! SVM allocation virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 104f40f76e..29f5e45d39 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -2006,10 +2006,15 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg, } // ================================================================================================ -void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { +void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg, + const AgentInfo* agentInfo) const { void* ptr = nullptr; - hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg); - hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); + uint32_t memFlags = 0; + if (mem_seg == kKernArg) { + memFlags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG; + } + hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, agentInfo); + hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, memFlags, &ptr); ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx," " numa_node = %d, mem_seg = %d", @@ -2029,32 +2034,11 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co return ptr; } -// ================================================================================================ -void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const { - void* ptr = nullptr; - hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo); - hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size); - if (stat != HSA_STATUS_SUCCESS) { - LogPrintfError("Fail allocation host memory with err %d", stat); - return nullptr; - } - - stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat); - hostFree(ptr, size); - return nullptr; - } - - return ptr; -} - // ================================================================================================ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { void* ptr = nullptr; #ifndef ROCCLR_SUPPORT_NUMA_POLICY - ptr = hostAlloc(size, alignment, mem_seg); + ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_); #else int mode = MPOL_DEFAULT; int maxNodes = numa_num_possible_nodes(); @@ -2077,14 +2061,14 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg // We only care about the first CPU node for (unsigned int i = 0; i < cpuCount; i++) { if ((1u << i) & *nodeMask->maskp) { - ptr = hostAgentAlloc(size, cpu_agents_[i], mem_seg); + ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]); break; } } break; default: // All other modes fall back to default mode - ptr = hostAlloc(size, alignment, mem_seg); + ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_); } numa_free_cpumask(nodeMask); #endif // ROCCLR_SUPPORT_NUMA_POLICY @@ -2182,12 +2166,12 @@ void Device::releaseMemory(void* ptr, size_t size) const { } } -void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain, - bool contiguous) const { +void* Device::deviceLocalAlloc(size_t size, const AllocationFlags& flags) const { const hsa_amd_memory_pool_t& pool = - (pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_ - : (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_ - : gpuvm_segment_; + (flags.pseudo_fine_grain_ && gpu_ext_fine_grained_segment_.handle) + ? gpu_ext_fine_grained_segment_ + : (flags.atomics_ && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_ + : gpuvm_segment_; if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) { DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle, @@ -2196,9 +2180,12 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain } uint32_t hsa_mem_flags = 0; - if (contiguous) { + if (flags.contiguous_) { hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG; } + if (flags.executable_) { + hsa_mem_flags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG; + } void* ptr = nullptr; hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, hsa_mem_flags, &ptr); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 615ff31502..af709ff252 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -412,7 +412,8 @@ class Device : public NullDevice { //! Gets free memory on a GPU device virtual bool globalFreeMemory(size_t* freeMemory) const; virtual void* hostAlloc(size_t size, size_t alignment, - MemorySegment mem_seg = MemorySegment::kNoAtomics) const; + MemorySegment mem_seg = MemorySegment::kNoAtomics, + const AgentInfo* agentInfo = nullptr) const; // nullptr uses default CPU agent virtual void hostFree(void* ptr, size_t size = 0) const; bool deviceAllowAccess(void* dst) const; @@ -420,8 +421,9 @@ class Device : public NullDevice { bool allowPeerAccess(device::Memory* memory) const; void deviceVmemRelease(uint64_t mem_handle) const; uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const; - void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false, - bool contiguous = false) const; + + void* deviceLocalAlloc(size_t size, + const AllocationFlags& flags = AllocationFlags{}) const; void* reserveMemory(size_t size, size_t alignment) const; void releaseMemory(void* ptr, size_t size) const; void memFree(void* ptr, size_t size) const; @@ -463,9 +465,6 @@ class Device : public NullDevice { //! Allocate host memory in terms of numa policy set by user void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const; - //! Allocate host memory from agent info - void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const; - //! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and //! return a new device pointer accessible by the GPU agent. void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const; diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index d337f92985..136b8f69b0 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -854,9 +854,11 @@ bool Buffer::create(bool alloc_local) { } } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); - deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0, - (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0, - (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0); + amd::Device::AllocationFlags flags = {}; + flags.atomics_ = (memFlags & CL_MEM_SVM_ATOMICS) != 0; + flags.pseudo_fine_grain_ = (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0; + flags.contiguous_ = (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0; + deviceMemory_ = dev().deviceLocalAlloc(size(), flags); } owner()->setSvmPtr(deviceMemory_); } else { diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index a9d1d58762..5135a49afc 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1582,7 +1582,9 @@ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) { if (mem_segment == Device::MemorySegment::kKernArg && (gpu_.dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) && gpu_.dev().info().largeBar_) { - pool_base_ = reinterpret_cast
(gpu_.dev().deviceLocalAlloc(pool_size_)); + amd::Device::AllocationFlags flags = {}; + flags.executable_ = true; + pool_base_ = reinterpret_cast
(gpu_.dev().deviceLocalAlloc(pool_size_, flags)); if (pool_base_ != nullptr) { // @note Workaround first access penalty. // KFD may update CPU page tables on the first CPU access