SWDEV-456279 - Add a new HIP flag to request contiguous memory and pass the flag to the HSA API.

Change-Id: I1bafeaa3096395c729723af958d609bc41e7845c [ROCm/clr commit: 1d48f2a1ab]
2024-04-12 08:58:46 +00:00
@@ -589,6 +589,8 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
    ihipFlags = CL_MEM_SVM_ATOMICS;
  } else if (flags == hipDeviceMallocUncached) {
    ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED;
+  } else if (flags == hipDeviceMallocContiguous) {
+    ihipFlags = ROCCLR_MEM_HSA_CONTIGUOUS | ROCCLR_MEM_HSA_UNCACHED;
  } else if (flags == hipMallocSignalMemory) {
    ihipFlags = CL_MEM_SVM_ATOMICS | CL_MEM_SVM_FINE_GRAIN_BUFFER | ROCCLR_MEM_HSA_SIGNAL_MEMORY;
    if (sizeBytes != 8) {
@@ -1772,7 +1772,7 @@ class Device : public RuntimeObject {
  }

  virtual void* deviceLocalAlloc(size_t size, bool atomics = false,
-                                 bool pseudo_fine_grain = false) const {
+                                 bool pseudo_fine_grain = false, bool contiguous = false) const {
    ShouldNotCallThis();
    return NULL;
  }
@@ -2331,7 +2331,8 @@ void Device::deviceVmemRelease(uint64_t mem_handle) const {
  }
 }

-void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const {
+void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain,
+                               bool contiguous) const {
  const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain) ? gpu_ext_fine_grained_segment_
                                      : (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_;

@@ -2341,6 +2342,11 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain
    return nullptr;
  }

+  uint32_t hsa_mem_flags = 0;  // allocation flags forwarded to hsa_amd_memory_pool_allocate
+  if (contiguous) {
+    hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG;
+  }
+
   void* ptr = nullptr;
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, hsa_mem_flags, &ptr);
  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa device memory %p, size 0x%zx", ptr, size);
@@ -453,7 +453,8 @@ class Device : public NullDevice {
  bool allowPeerAccess(device::Memory* memory) const;
  void deviceVmemRelease(uint64_t mem_handle) const;
  uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
-  void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const;
+  void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false,
+                         bool contiguous = false) const;

  void memFree(void* ptr, size_t size) const;

@@ -856,7 +856,8 @@ bool Buffer::create(bool alloc_local) {
      } else {
        assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
        deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0,
-                                               (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0);
+                                               (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0,
+                                               (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0);
      }
      owner()->setSvmPtr(deviceMemory_);
    } else {
@@ -44,6 +44,7 @@
 #define ROCCLR_MEM_HSA_UNCACHED         (1u << 27)
 #define ROCCLR_MEM_INTERPROCESS         (1u << 26)
 #define ROCCLR_MEM_PHYMEM               (1u << 25)
+#define ROCCLR_MEM_HSA_CONTIGUOUS       (1u << 24)

 namespace device {
 class Memory;