SWDEV-270013 - Allocate kernel_arguments from kern_arg & finegrain pool instead of coarse grain.

Change-Id: Id4c6977934fdd6ef2311f6e75593801f1e51983c [ROCm/clr commit: 2df099df9e]
2021-02-02 12:22:47 -05:00
parent 4da1282882
commit 4583cbafee
11 changed files with 54 additions and 22 deletions
@@ -1486,6 +1486,12 @@ class Device : public RuntimeObject {
    kLinkAtomicSupport
  } LinkAttribute;

+  typedef enum MemorySegment {  //! Kind of host memory requested from hostAlloc()
+    kNoAtomics = 0,  //!< No atomics requested (former atomics == false); hostAlloc() default
+    kAtomics = 1,    //!< Atomics requested (former atomics == true)
+    kKernArg = 2     //!< Requested for the VirtualGPU kernel-argument pool
+  } MemorySegment;
+
  typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;

  static constexpr size_t kP2PStagingSize = 4 * Mi;
@@ -1624,7 +1630,8 @@ class Device : public RuntimeObject {
  /**
   * @copydoc amd::Context::hostAlloc
   */
-  virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const {
+  virtual void* hostAlloc(size_t size, size_t alignment,
+                          MemorySegment mem_seg = kNoAtomics) const {
    ShouldNotCallThis();
    return NULL;
  }
@@ -2144,7 +2144,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
                     hwStateSize);
 }

-void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
  // for discrete gpu, we only reserve,no commit yet.
  return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE);
 }
@@ -523,7 +523,7 @@ class Device : public NullDevice, public CALGSLDevice {
                     ) const;

  //! host memory alloc
-  virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;

  //! SVM allocation
  virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
@@ -2112,7 +2112,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
  iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
 }

-void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
  // for discrete gpu, we only reserve,no commit yet.
  return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
 }
@@ -473,7 +473,7 @@ class Device : public NullDevice {
                     ) const;

  //! host memory alloc
-  virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+  virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;

  //! SVM allocation
  virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
@@ -550,7 +550,8 @@ bool PerfCounterProfile::initialize() {
  }

  if (cmd_buf.ptr == nullptr) {
-    void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment, 1);
+    void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment,
+                                          Device::MemorySegment::kAtomics);
    if (buf_ptr != nullptr) {
      profile_.command_buffer.ptr = buf_ptr;
    }
@@ -565,7 +566,8 @@ bool PerfCounterProfile::initialize() {
  }

  if (out_buf.ptr == nullptr) {
-    void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment, 1);
+    void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment,
+                                          Device::MemorySegment::kAtomics);
    if (buf_ptr != nullptr) {
      profile_.output_buffer.ptr = buf_ptr;
    }
@@ -1845,12 +1845,27 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
 }

 // ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
  void* ptr = nullptr;
-  // If runtime disables barrier, then all host allocations must have L2 disabled
-  const hsa_amd_memory_pool_t segment = (!atomics && settings().barrier_sync_)
-      ? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ : system_segment_
-      : system_segment_;
+
+  hsa_amd_memory_pool_t segment;
+  switch (mem_seg) {
+    case kKernArg :
+    case kNoAtomics :
+      // If runtime disables barrier, then all host allocations must have L2 disabled
+      if ((settings().barrier_sync_) && (system_coarse_segment_.handle != 0)) {
+        segment = system_coarse_segment_;
+        break;
+      }
+      // Falls through on else case.
+    case kAtomics :
+      segment = system_segment_;
+      break;
+    default :
+      guarantee(false && "Invalid Memory Segment");
+      break;
+  }
+
  assert(segment.handle != 0);
  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
@@ -1900,7 +1915,8 @@ void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomi
 void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
  void* ptr = nullptr;
 #ifndef ROCCLR_SUPPORT_NUMA_POLICY
-  ptr = hostAlloc(size, alignment, atomics);
+  ptr = hostAlloc(size, alignment, atomics
+                  ? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
 #else
  int mode = MPOL_DEFAULT;
  unsigned long nodeMask = 0;
@@ -1930,7 +1946,8 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
      break;
    default:
      //  All other modes fall back to default mode
-      ptr = hostAlloc(size, alignment, atomics);
+      ptr = hostAlloc(size, alignment, atomics
+                      ? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
  }
 #endif // ROCCLR_SUPPORT_NUMA_POLICY
  return ptr;
@@ -363,7 +363,8 @@ class Device : public NullDevice {
  //! Gets free memory on a GPU device
  virtual bool globalFreeMemory(size_t* freeMemory) const;

-  virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+  virtual void* hostAlloc(size_t size, size_t alignment,
+                          MemorySegment mem_seg = MemorySegment::kNoAtomics) const;

  virtual void hostFree(void* ptr, size_t size = 0) const;

@@ -696,7 +696,7 @@ void Buffer::destroy() {
 // ================================================================================================
 bool Buffer::create() {
  if (owner() == nullptr) {
-    deviceMemory_ = dev().hostAlloc(size(), 1, false);
+    deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
    if (deviceMemory_ != nullptr) {
      flags_ |= HostMemoryDirectAccess;
      return true;
@@ -730,12 +730,14 @@ bool Buffer::create() {
          // GPU accessible or prefetch memory into GPU
          dev().SvmAllocInit(deviceMemory_, size());
 #else
-          deviceMemory_ = dev().hostAlloc(size(), 1, false);
+          deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
 #endif // AMD_HMM_SUPPORT
        } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
          deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
        } else {
-          deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
+          deviceMemory_ = dev().hostAlloc(size(), 1, ((memFlags & CL_MEM_SVM_ATOMICS) != 0)
+                                                       ? Device::MemorySegment::kAtomics
+                                                       : Device::MemorySegment::kNoAtomics);
        }
      } else {
        assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
@@ -810,7 +812,7 @@ bool Buffer::create() {
        return true;
      }

-      deviceMemory_ = dev().hostAlloc(size(), 1, false);
+      deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
      owner()->setHostMem(deviceMemory_);

      if ((deviceMemory_ != nullptr) && dev().settings().apuSystem_) {
@@ -1102,7 +1104,7 @@ bool Image::create() {
  }

  if (originalDeviceMemory_ == nullptr) {
-    originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false);
+    originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, Device::MemorySegment::kNoAtomics);
    if ((originalDeviceMemory_ != nullptr) && dev().settings().apuSystem_) {
      const_cast<Device&>(dev()).updateFreeMemory(alloc_size, false);
    }
@@ -1041,7 +1041,8 @@ bool VirtualGPU::create() {
 // ================================================================================================
 bool VirtualGPU::initPool(size_t kernarg_pool_size) {
  kernarg_pool_size_ = kernarg_pool_size;
-  kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
+  kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, 0,
+                                               Device::MemorySegment::kKernArg));
  if (kernarg_pool_base_ == nullptr) {
    return false;
  }
@@ -284,7 +284,9 @@ int Context::create(const intptr_t* properties) {

 void* Context::hostAlloc(size_t size, size_t alignment, bool atomics) const {
  if (customHostAllocDevice_ != NULL) {
-    return customHostAllocDevice_->hostAlloc(size, alignment, atomics);
+    return customHostAllocDevice_->hostAlloc(size, alignment, atomics
+                                             ? Device::MemorySegment::kAtomics
+                                             : Device::MemorySegment::kNoAtomics);
  }
  return AlignedMemory::allocate(size, alignment);
 }