SWDEV-543723 - Execute permission for kernArg buf (#728)

- Refactor deviceLocalAlloc arguments.
- Refactor hostAlloc code to have a cleaner interface.
- Kernarg buffers need to have the execute flag set, as CP enforces this on certain newer HW.
2025-09-08 12:21:30 -07:00
@@ -758,7 +758,9 @@ bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size, amd::Device*
  // callback thread.
  device_ = device;
  if (device->info().largeBar_) {
-    graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size));
+    amd::Device::AllocationFlags flags = {};
+    flags.executable_ = true;
+    graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size, flags));
    device_kernarg_pool_ = true;
  } else {
    graph_kernarg_base = reinterpret_cast<address>(
@@ -1814,13 +1814,27 @@ class Device : public RuntimeObject {
  /**
   * @copydoc amd::Context::hostAlloc
   */
-  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const {
+  virtual void* hostAlloc(size_t size, size_t alignment,
+                          MemorySegment mem_seg = kNoAtomics,
+                          const void* agentInfo = nullptr) const {
    ShouldNotCallThis();
    return NULL;
  }

-  virtual void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
-                                 bool contiguous = false) const {
+  //! Optional attributes for deviceLocalAlloc(); an all-zero value requests none of them
+  typedef union {
+    struct {
+      uint32_t atomics_ : 1;           //!< True if atomics support is required
+      uint32_t pseudo_fine_grain_ : 1; //!< True if pseudo fine grain memory is required
+      uint32_t contiguous_ : 1;        //!< True if contiguous memory allocation is required
+      uint32_t executable_ : 1;        //!< True if executable memory is required
+      uint32_t reserved_ : 28;         //!< Reserved for future use
+    };
+    uint32_t data_;                    //!< All flag bits viewed as a single 32-bit word
+  } AllocationFlags;
+
+  virtual void* deviceLocalAlloc(
+      size_t size, const AllocationFlags& flags = AllocationFlags{}) const {
    ShouldNotCallThis();
    return NULL;
  }
@@ -2420,7 +2420,8 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
  iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
 }

-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
+                        const void* agentInfo) const {
  // for discrete gpu, we only reserve,no commit yet.
  return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
 }
@@ -535,7 +535,8 @@ class Device : public NullDevice {
  ) const;

  //! host memory alloc
-  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
+  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics,
+                          const void* agentInfo = nullptr) const;

  //! SVM allocation
  virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
@@ -2006,10 +2006,15 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
 }

 // ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg,
+                        const AgentInfo* agentInfo) const {
  void* ptr = nullptr;
-  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
+  uint32_t memFlags = 0;
+  if (mem_seg == kKernArg) {
+    memFlags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
+  }
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, agentInfo);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, memFlags, &ptr);
  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
          "Allocate hsa host memory %p, size 0x%zx,"
          " numa_node = %d, mem_seg = %d",
@@ -2029,32 +2034,11 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
  return ptr;
 }

-// ================================================================================================
-void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
-  void* ptr = nullptr;
-  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
-  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
-  if (stat != HSA_STATUS_SUCCESS) {
-    LogPrintfError("Fail allocation host memory with err %d", stat);
-    return nullptr;
-  }
-
-  stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr);
-  if (stat != HSA_STATUS_SUCCESS) {
-    LogPrintfError("Fail hsa_amd_agents_allow_access with err %d", stat);
-    hostFree(ptr, size);
-    return nullptr;
-  }
-
-  return ptr;
-}
-
 // ================================================================================================
 void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
  void* ptr = nullptr;
 #ifndef ROCCLR_SUPPORT_NUMA_POLICY
-  ptr = hostAlloc(size, alignment, mem_seg);
+  ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
 #else
  int mode = MPOL_DEFAULT;
  int maxNodes = numa_num_possible_nodes();
@@ -2077,14 +2061,14 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
      // We only care about the first CPU node
      for (unsigned int i = 0; i < cpuCount; i++) {
        if ((1u << i) & *nodeMask->maskp) {
-          ptr = hostAgentAlloc(size, cpu_agents_[i], mem_seg);
+          ptr = hostAlloc(size, alignment, mem_seg, &cpu_agents_[i]);
          break;
        }
      }
      break;
    default:
      //  All other modes fall back to default mode
-      ptr = hostAlloc(size, alignment, mem_seg);
+      ptr = hostAlloc(size, alignment, mem_seg, cpu_agent_info_);
  }
  numa_free_cpumask(nodeMask);
 #endif  // ROCCLR_SUPPORT_NUMA_POLICY
@@ -2182,12 +2166,12 @@ void Device::releaseMemory(void* ptr, size_t size) const {
  }
 }

-void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain,
-                               bool contiguous) const {
+void* Device::deviceLocalAlloc(size_t size, const AllocationFlags& flags) const {
  const hsa_amd_memory_pool_t& pool =
-      (pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_
-      : (atomics && gpu_fine_grained_segment_.handle)             ? gpu_fine_grained_segment_
-                                                                  : gpuvm_segment_;
+      (flags.pseudo_fine_grain_ && gpu_ext_fine_grained_segment_.handle)
+          ? gpu_ext_fine_grained_segment_
+      : (flags.atomics_ && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
+                                                             : gpuvm_segment_;

  if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
    DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle,
@@ -2196,9 +2180,12 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain
  }

  uint32_t hsa_mem_flags = 0;
-  if (contiguous) {
+  if (flags.contiguous_) {
    hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG;
  }
+  if (flags.executable_) {
+    hsa_mem_flags |= HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
+  }

  void* ptr = nullptr;
  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, hsa_mem_flags, &ptr);
@@ -412,7 +412,8 @@ class Device : public NullDevice {
  //! Gets free memory on a GPU device
  virtual bool globalFreeMemory(size_t* freeMemory) const;
  virtual void* hostAlloc(size_t size, size_t alignment,
-                          MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
+                          MemorySegment mem_seg = MemorySegment::kNoAtomics,
+                          const AgentInfo* agentInfo = nullptr) const;  // nullptr uses default CPU agent
  virtual void hostFree(void* ptr, size_t size = 0) const;

  bool deviceAllowAccess(void* dst) const;
@@ -420,8 +421,9 @@ class Device : public NullDevice {
  bool allowPeerAccess(device::Memory* memory) const;
  void deviceVmemRelease(uint64_t mem_handle) const;
  uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
-  void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain = false,
-                         bool contiguous = false) const;
+
+  void* deviceLocalAlloc(size_t size,
+                        const AllocationFlags& flags = AllocationFlags{}) const;
  void* reserveMemory(size_t size, size_t alignment) const;
  void releaseMemory(void* ptr, size_t size) const;
  void memFree(void* ptr, size_t size) const;
@@ -463,9 +465,6 @@ class Device : public NullDevice {
  //! Allocate host memory in terms of numa policy set by user
  void* hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const;

-  //! Allocate host memory from agent info
-  void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
-
  //! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
  //! return a new device pointer accessible by the GPU agent.
  void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
@@ -854,9 +854,11 @@ bool Buffer::create(bool alloc_local) {
        }
      } else {
        assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
-        deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0,
-                                               (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0,
-                                               (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0);
+        amd::Device::AllocationFlags flags = {};
+        flags.atomics_ = (memFlags & CL_MEM_SVM_ATOMICS) != 0;
+        flags.pseudo_fine_grain_ = (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0;
+        flags.contiguous_ = (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0;
+        deviceMemory_ = dev().deviceLocalAlloc(size(), flags);
      }
      owner()->setSvmPtr(deviceMemory_);
    } else {
@@ -1582,7 +1582,9 @@ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
  if (mem_segment == Device::MemorySegment::kKernArg &&
      (gpu_.dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
      gpu_.dev().info().largeBar_) {
-    pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_));
+    amd::Device::AllocationFlags flags = {};
+    flags.executable_ = true;
+    pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_, flags));
    if (pool_base_ != nullptr) {
      // @note Workaround first access penalty.
      // KFD may update CPU page tables on the first CPU access