From e197aa83ba3310ee2cb72a5a626676c73ea86c5a Mon Sep 17 00:00:00 2001
From: SaleelK <Saleel.Kudchadker@amd.com>
Date: Mon, 8 Sep 2025 12:21:30 -0700
Subject: [PATCH] SWDEV-543723 - Execute permission for kernArg buf (#728)

- Refactor deviceLocalAlloc arguments
- Refactor hostAlloc code, have cleaner interface
- Kern args buffer need to have execute flag set as CP enforces this on
  certain newer HW.
---
 .../clr/hipamd/src/hip_graph_internal.cpp     |  4 +-
 projects/clr/rocclr/device/device.hpp         | 20 +++++--
 projects/clr/rocclr/device/pal/paldevice.cpp  |  3 +-
 projects/clr/rocclr/device/pal/paldevice.hpp  |  3 +-
 projects/clr/rocclr/device/rocm/rocdevice.cpp | 53 +++++++------------
 projects/clr/rocclr/device/rocm/rocdevice.hpp | 11 ++--
 projects/clr/rocclr/device/rocm/rocmemory.cpp |  8 +--
 .../clr/rocclr/device/rocm/rocvirtual.cpp     |  4 +-
 8 files changed, 57 insertions(+), 49 deletions(-)
diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index 4a7e735b00..e55f41c645 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -758,7 +758,9 @@ bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size, amd::Device*
   // callback thread.
   device_ = device;
   if (device->info().largeBar_) {
-    graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size));
+    amd::Device::AllocationFlags flags = {};
+    flags.executable_ = true;
+    graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size, flags));
     device_kernarg_pool_ = true;
   } else {
     graph_kernarg_base = reinterpret_cast<address>(
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 1b7bcd41d2..84558efcba 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1814,13 +1814,27 @@ class Device : public RuntimeObject {
   /**
    * @copydoc amd::Context::hostAlloc
    */
-  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const {
+  virtual void* hostAlloc(size_t size, size_t alignment,
+                          MemorySegment mem_seg = kNoAtomics,
+                          const void* agentInfo = nullptr) const {
     ShouldNotCallThis();
     return NULL;
   }
 
-  virtual void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
-                                 bool contiguous = false) const {
+  //! Flags for deviceLocalAlloc method
+  typedef union {
+    struct {
+      uint32_t atomics_ : 1;           //!< True if atomics support is required
+      uint32_t pseudo_fine_grain_ : 1; //!< True if pseudo fine grain memory is required
+      uint32_t contiguous_ : 1;        //!< True if contiguous memory allocation is required
+      uint32_t executable_ : 1;        //!< True if executable memory is required
+      uint32_t reserved_ : 28;         //!< Reserved for future use
+    };
+    uint32_t data_;
+  } AllocationFlags;
+
+  virtual void* deviceLocalAlloc(
+      size_t size, const AllocationFlags& flags = AllocationFlags{}) const {
     ShouldNotCallThis();
     return NULL;
   }
diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp
index dcf58d531c..0003cf04ea 100644
--- a/projects/clr/rocclr/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/device/pal/paldevice.cpp
@@ -2420,7 +2420,8 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
   iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
 }
 
-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
+                        const void* agentInfo) const {
   // for discrete gpu, we only reserve,no commit yet.
   return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
 }
diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp
index 7cca51e244..bff5fe15a7 100644
--- a/projects/clr/rocclr/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/device/pal/paldevice.hpp
@@ -535,7 +535,8 @@ class Device : public NullDevice {
   ) const;
 
   //! host memory alloc
-  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
+  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics,
+                          const void* agentInfo = nullptr) const;
 
   //! SVM allocation
   virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 104f40f76e..29f5e45d39 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -2006,10 +2006,15 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
 }
 
 // ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
+                        const AgentInfo* agentInfo) const {
   void* ptr = nullptr;
-  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
+  uint32_t memFlags = 0;
+  if (mem_seg == kKernArg) {
+    memFlags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
+  }
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, agentInfo);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, memFlags, &ptr);
   ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
           "Allocate hsa host memory %p, size 0x%zx,"
           " numa_node = %d, mem_seg = %d",
@@ -2029,32 +2034,11 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
   return ptr;
 }
 
-// ================================================================================================
-void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
-  void* ptr = nullptr;
-  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
-  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
-  if (stat != HSA_STATUS_SUCCESS) {
-    LogPrintfError("Fail allocation host memory with err %d", stat);
-    return nullptr;
-  }
-
-  stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
-  if (stat != HSA_STATUS_SUCCESS) {
-    LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat);
-    hostFree(ptr, size);
-    return nullptr;
-  }
-
-  return ptr;
-}
-
 // ================================================================================================
 void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
   void* ptr = nullptr;
 #ifndef ROCCLR_SUPPORT_NUMA_POLICY
-  ptr = hostAlloc(size, alignment, mem_seg);
+  ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
 #else
   int mode = MPOL_DEFAULT;
   int maxNodes = numa_num_possible_nodes();
@@ -2077,14 +2061,14 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
       // We only care about the first CPU node
       for (unsigned int i = 0; i < cpuCount; i++) {
         if ((1u << i) & *nodeMask->maskp) {
-          ptr = hostAgentAlloc(size, cpu_agents_[i], mem_seg);
+          ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]);
           break;
         }
       }
       break;
     default:
       //  All other modes fall back to default mode
-      ptr = hostAlloc(size, alignment, mem_seg);
+      ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
   }
   numa_free_cpumask(nodeMask);
 #endif  // ROCCLR_SUPPORT_NUMA_POLICY
@@ -2182,12 +2166,12 @@ void Device::releaseMemory(void* ptr, size_t size) const {
   }
 }
 
-void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain,
-                               bool contiguous) const {
+void* Device::deviceLocalAlloc(size_t size, const AllocationFlags& flags) const {
   const hsa_amd_memory_pool_t& pool =
-      (pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_
-      : (atomics && gpu_fine_grained_segment_.handle)             ? gpu_fine_grained_segment_
-                                                                  : gpuvm_segment_;
+      (flags.pseudo_fine_grain_ && gpu_ext_fine_grained_segment_.handle)
+          ? gpu_ext_fine_grained_segment_
+      : (flags.atomics_ && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
+                                                             : gpuvm_segment_;
 
   if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
     DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle,
@@ -2196,9 +2180,12 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain
   }
 
   uint32_t hsa_mem_flags = 0;
-  if (contiguous) {
+  if (flags.contiguous_) {
     hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG;
   }
+  if (flags.executable_) {
+    hsa_mem_flags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
+  }
 
   void* ptr = nullptr;
   hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, hsa_mem_flags, &ptr);
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 615ff31502..af709ff252 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -412,7 +412,8 @@ class Device : public NullDevice {
   //! Gets free memory on a GPU device
   virtual bool globalFreeMemory(size_t* freeMemory) const;
   virtual void* hostAlloc(size_t size, size_t alignment,
-                          MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
+                          MemorySegment mem_seg = MemorySegment::kNoAtomics,
+                          const AgentInfo* agentInfo = nullptr) const;  // nullptr uses default CPU agent
   virtual void hostFree(void* ptr, size_t size = 0) const;
 
   bool deviceAllowAccess(void* dst) const;
@@ -420,8 +421,9 @@ class Device : public NullDevice {
   bool allowPeerAccess(device::Memory* memory) const;
   void deviceVmemRelease(uint64_t mem_handle) const;
   uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
-  void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
-                         bool contiguous = false) const;
+
+  void* deviceLocalAlloc(size_t size,
+                        const AllocationFlags& flags = AllocationFlags{}) const;
   void* reserveMemory(size_t size, size_t alignment) const;
   void releaseMemory(void* ptr, size_t size) const;
   void memFree(void* ptr, size_t size) const;
@@ -463,9 +465,6 @@ class Device : public NullDevice {
   //! Allocate host memory in terms of numa policy set by user
   void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;
 
-  //! Allocate host memory from agent info
-  void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
-
   //! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
   //! return a new device pointer accessible by the GPU agent.
   void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp
index d337f92985..136b8f69b0 100644
--- a/projects/clr/rocclr/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp
@@ -854,9 +854,11 @@ bool Buffer::create(bool alloc_local) {
         }
       } else {
         assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
-        deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0,
-                                               (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0,
-                                               (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0);
+        amd::Device::AllocationFlags flags = {};
+        flags.atomics_ = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
+        flags.pseudo_fine_grain_ = (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0;
+        flags.contiguous_ = (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0;
+        deviceMemory_ = dev().deviceLocalAlloc(size(), flags);
       }
       owner()->setSvmPtr(deviceMemory_);
     } else {
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index a9d1d58762..5135a49afc 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -1582,7 +1582,9 @@ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
   if (mem_segment == Device::MemorySegment::kKernArg &&
       (gpu_.dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
       gpu_.dev().info().largeBar_) {
-    pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_));
+    amd::Device::AllocationFlags flags = {};
+    flags.executable_ = true;
+    pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_, flags));
     if (pool_base_ != nullptr) {
       // @note Workaround first access penalty.
       // KFD may update CPU page tables on the first CPU access