Refactor: Consolidate calls to hsaKmtAllocMemory

Route all device-visible system memory allocations through system_allocator. Change-Id: I5e90a1bf491e432678a6d8ab1f9f3770734cbda1 [ROCm/ROCR-Runtime commit: 74f5aca93d]
2016-08-22 20:19:21 -05:00
@@ -43,6 +43,6 @@
 #include "core/common/shared.h"

 namespace core {
-std::function<void*(size_t, size_t)> BaseShared::allocate_=nullptr;
-std::function<void(void*)> BaseShared::free_=nullptr;
+std::function<void*(size_t, size_t, uint32_t)> BaseShared::allocate_ = nullptr;
+std::function<void(void*)> BaseShared::free_ = nullptr;
 }
@@ -55,14 +55,14 @@ namespace core {
 class BaseShared {
 public:
  static void SetAllocateAndFree(
-      const std::function<void*(size_t, size_t)>& allocate,
+      const std::function<void*(size_t, size_t, uint32_t)>& allocate,
      const std::function<void(void*)>& free) {
    allocate_ = allocate;
    free_ = free;
  }

 protected:
-  static std::function<void*(size_t, size_t)> allocate_;
+  static std::function<void*(size_t, size_t, uint32_t)> allocate_;
  static std::function<void(void*)> free_;
 };

@@ -78,7 +78,7 @@ class Shared : public BaseShared {
                  "Align is less than alignof(T)");

    shared_object_ =
-        reinterpret_cast<T*>(allocate_(sizeof(T), Max(__alignof(T), Align)));
+        reinterpret_cast<T*>(allocate_(sizeof(T), Max(__alignof(T), Align), 0));

    assert(shared_object_ != NULL && "Failed on allocating shared_object_");

@@ -49,7 +49,6 @@

 #include <vector>

-#include "core/inc/runtime.h"
 #include "core/inc/checked.h"
 #include "core/inc/isa.h"
 #include "core/inc/queue.h"
@@ -44,6 +44,7 @@
 #define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_

 #include <map>
+#include <mutex>
 #include <stdint.h>

 #include "core/inc/blit.h"
@@ -99,9 +99,7 @@ class MemoryRegion : public core::MemoryRegion {

  ~MemoryRegion();

-  hsa_status_t Allocate(size_t size, void** address) const;
-
-  hsa_status_t Allocate(bool restrict_access, size_t size,
+  hsa_status_t Allocate(size_t size, AllocateFlags alloc_flags,
                        void** address) const;

  hsa_status_t Free(void* address, size_t size) const;
@@ -43,7 +43,8 @@
 #ifndef HSA_RUNTME_CORE_INC_CHECKED_H_
 #define HSA_RUNTME_CORE_INC_CHECKED_H_

-#include "stdint.h"
+#include <stdint.h>
+#include <stdlib.h>

 namespace core {

@@ -47,7 +47,6 @@

 #include <vector>

-#include "core/inc/runtime.h"
 #include "core/inc/agent.h"
 #include "core/inc/checked.h"

@@ -81,7 +80,17 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
    return reinterpret_cast<MemoryRegion*>(region.handle);
  }

-  virtual hsa_status_t Allocate(size_t size, void** address) const = 0;
+  enum AllocateEnum {
+    AllocateNoFlags = 0,
+    AllocateRestrict = (1 << 0),    // Don't map system memory to GPU agents
+    AllocateExecutable = (1 << 1),  // Set executable permission
+    AllocateDoubleMap = (1 << 2),   // Map the backing store twice in VA space
+  };
+
+  typedef uint32_t AllocateFlags;
+
+  virtual hsa_status_t Allocate(size_t size, AllocateFlags alloc_flags,
+                                void** address) const = 0;

  virtual hsa_status_t Free(void* address, size_t size) const = 0;

@@ -48,7 +48,6 @@

 #include "core/common/shared.h"

-#include "core/inc/runtime.h"
 #include "core/inc/checked.h"

 #include "core/util/utils.h"
@@ -151,25 +151,14 @@ class Runtime {
  ///
  /// @param [in] region Pointer to region object.
  /// @param [in] size Allocation size in bytes.
+  /// @param [in] alloc_flags Modifiers to pass to MemoryRegion allocator.
  /// @param [out] address Pointer to store the allocation result.
  ///
  /// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
  hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size,
+                              MemoryRegion::AllocateFlags alloc_flags,
                              void** address);

-  /// @brief Allocate memory on a particular region with option to restrict
-  /// access to the owning agent.
-  ///
-  /// @param [in] restrict_access If true, the allocation result would only be
-  /// accessible to the agent(s) that own the region object.
-  /// @param [in] region Pointer to region object.
-  /// @param [in] size Allocation size in bytes.
-  /// @param [out] address Pointer to store the allocation result.
-  ///
-  /// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
-  hsa_status_t AllocateMemory(bool restrict_access, const MemoryRegion* region,
-                              size_t size, void** address);
-
  /// @brief Free memory previously allocated with AllocateMemory.
  ///
  /// @param [in] ptr Address of the memory to be freed.
@@ -292,7 +281,8 @@ class Runtime {

  amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; }

-  std::function<void*(size_t, size_t)>& system_allocator() {
+  std::function<void*(size_t, size_t, MemoryRegion::AllocateFlags)>&
+  system_allocator() {
    return system_allocator_;
  }

@@ -446,7 +436,8 @@ class Runtime {
  std::map<const void*, AllocationRegion> allocation_map_;

  // Allocator using ::system_region_
-  std::function<void*(size_t, size_t)> system_allocator_;
+  std::function<void*(size_t, size_t, MemoryRegion::AllocateFlags)>
+      system_allocator_;

  // Deallocator using ::system_region_
  std::function<void(void*)> system_deallocator_;
@@ -266,21 +266,12 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
  SignalGuard.Dismiss();
 #endif

-  HsaMemFlags pm4_ib_buf_flags = {0};
-  pm4_ib_buf_flags.ui32.HostAccess = 1;
-  pm4_ib_buf_flags.ui32.ExecuteAccess = 1;
-  pm4_ib_buf_flags.ui32.NoSubstitute = 1;
-
-  HSAKMT_STATUS err =
-      hsaKmtAllocMemory(agent_->node_id(), pm4_ib_size_b_, pm4_ib_buf_flags, &pm4_ib_buf_);
-  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(PM4 IB) failed");
-
-  err = hsaKmtMapMemoryToGPU(pm4_ib_buf_, pm4_ib_size_b_, NULL);
-  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(PM4 IB) failed");
+  pm4_ib_buf_ = core::Runtime::runtime_singleton_->system_allocator()(
+      pm4_ib_size_b_, 0x1000, core::MemoryRegion::AllocateExecutable);
+  if (pm4_ib_buf_ == NULL) return;

  MAKE_NAMED_SCOPE_GUARD(PM4IBGuard, [&]() {
-    hsaKmtUnmapMemoryToGPU(pm4_ib_buf_);
-    hsaKmtFreeMemory(pm4_ib_buf_, pm4_ib_size_b_);
+    core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
  });

  valid_ = true;
@@ -314,8 +305,7 @@ AqlQueue::~AqlQueue() {
  }
 #endif

-  hsaKmtUnmapMemoryToGPU(pm4_ib_buf_);
-  hsaKmtFreeMemory(pm4_ib_buf_, pm4_ib_size_b_);
+  core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
 }

 uint64_t AqlQueue::LoadReadIndexAcquire() {
@@ -631,34 +621,19 @@ void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) {
 #endif
  } else {
    // Allocate storage for the ring buffer.
-    HsaMemFlags flags;
-    flags.Value = 0;
-    flags.ui32.HostAccess = 1;
-    flags.ui32.AtomicAccessPartial = 1;
-    flags.ui32.ExecuteAccess = 1;
-    flags.ui32.AQLQueueMemory = 1;
-
    ring_buf_alloc_bytes_ = AlignUp(
        queue_size_pkts * static_cast<uint32_t>(sizeof(core::AqlPacket)), 4096);
-    auto err = hsaKmtAllocMemory(agent_->node_id(), ring_buf_alloc_bytes_,
-                                 flags, (void**)&ring_buf_);

-    if (err != HSAKMT_STATUS_SUCCESS) {
-      assert(false && "AQL queue memory allocation failure.");
-      return;
-    }
+    ring_buf_ = core::Runtime::runtime_singleton_->system_allocator()(
+        ring_buf_alloc_bytes_, 0x1000,
+        core::MemoryRegion::AllocateExecutable |
+            core::MemoryRegion::AllocateDoubleMap);

-    HSAuint64 alternate_va;
-    err = hsaKmtMapMemoryToGPU(ring_buf_, ring_buf_alloc_bytes_, &alternate_va);
+    assert(ring_buf_ != NULL && "AQL queue memory allocation failure");

-    if (err != HSAKMT_STATUS_SUCCESS) {
-      assert(false && "AQL queue memory map failure.");
-      hsaKmtFreeMemory(ring_buf_, ring_buf_alloc_bytes_);
-      ring_buf_ = NULL;
-      return;
-    }
-
-    ring_buf_alloc_bytes_ = 2 * ring_buf_alloc_bytes_;
+    // The virtual ring allocation is twice as large as requested.
+    // Each half maps to the same set of physical pages.
+    ring_buf_alloc_bytes_ *= 2;
  }
 }

@@ -673,8 +648,7 @@ void AqlQueue::FreeRegisteredRingBuffer() {
        (void*)(uintptr_t(ring_buf_) + (ring_buf_alloc_bytes_ / 2)));
 #endif
  } else {
-    hsaKmtUnmapMemoryToGPU(ring_buf_);
-    hsaKmtFreeMemory(ring_buf_, ring_buf_alloc_bytes_ / 2);
+    core::Runtime::runtime_singleton_->system_deallocator()(ring_buf_);
  }

  ring_buf_ = NULL;
@@ -537,7 +537,8 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {

  kernarg_async_ = reinterpret_cast<KernelArgs*>(
      core::Runtime::runtime_singleton_->system_allocator()(
-          queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), 16));
+          queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), 16,
+          core::MemoryRegion::AllocateNoFlags));

  kernarg_async_mask_ = queue_->public_handle()->size - 1;

@@ -447,26 +447,11 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
  // Allocate queue buffer.
  queue_size_ = kQueueSize;

-  HsaMemFlags flags;
-  flags.Value = 0;
-  flags.ui32.HostAccess = 1;
-  flags.ui32.AtomicAccessPartial = 1;
-  flags.ui32.ExecuteAccess = 1;
+  queue_start_addr_ =
+      (char*)core::Runtime::runtime_singleton_->system_allocator()(
+          queue_size_, 0x1000, core::MemoryRegion::AllocateExecutable);

-  auto err = hsaKmtAllocMemory(amd_gpu_agent.node_id(), queue_size_, flags,
-                               reinterpret_cast<void**>(&queue_start_addr_));
-
-  if (err != HSAKMT_STATUS_SUCCESS) {
-    assert(false && "SDMA queue memory allocation failure.");
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
-  HSAuint64 alternate_va;
-  err = hsaKmtMapMemoryToGPU(queue_start_addr_, queue_size_, &alternate_va);
-
-  if (err != HSAKMT_STATUS_SUCCESS) {
-    assert(false && "AQL queue memory map failure.");
-    Destroy(agent);
+  if (queue_start_addr_ == NULL) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

@@ -494,7 +479,8 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {

  fence_base_addr_ = reinterpret_cast<uint32_t*>(
      core::Runtime::runtime_singleton_->system_allocator()(
-          fence_pool_size_ * sizeof(uint32_t), 256));
+          fence_pool_size_ * sizeof(uint32_t), 256,
+          core::MemoryRegion::AllocateNoFlags));

  if (fence_base_addr_ == NULL) {
    Destroy(agent);
@@ -516,8 +502,7 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {

  if (queue_start_addr_ != NULL && queue_size_ != 0) {
    // Release queue buffer.
-    hsaKmtUnmapMemoryToGPU(queue_start_addr_);
-    hsaKmtFreeMemory(queue_start_addr_, queue_size_);
+    core::Runtime::runtime_singleton_->system_deallocator()(queue_start_addr_);
  }

  if (fence_base_addr_ != NULL) {
@@ -124,14 +124,8 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
  // Populate region list.
  InitRegionList();

-  // Reserve memory for scratch.
-  InitScratchPool();
-
  // Populate cache list.
  InitCacheList();
-
-  // Bind the second-level trap handler to this node.
-  BindTrapHandler();
 }

 GpuAgent::~GpuAgent() {
@@ -214,21 +208,13 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
  }

  // Allocate a GPU-visible buffer for the shader.
-  HsaMemFlags code_buf_flags = {0};
-  code_buf_flags.ui32.HostAccess = 1;
-  code_buf_flags.ui32.ExecuteAccess = 1;
-  code_buf_flags.ui32.NoSubstitute = 1;
-
  size_t header_size =
      (assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0);
  code_buf_size = AlignUp(header_size + asic_shader->size, 0x1000);

-  HSAKMT_STATUS err =
-      hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf);
-  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Trap) failed");
-
-  err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL);
-  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed");
+  code_buf = core::Runtime::runtime_singleton_->system_allocator()(
+      code_buf_size, 0x1000, core::MemoryRegion::AllocateExecutable);
+  assert(code_buf != NULL && "Code buffer allocation failed");

  memset(code_buf, 0, code_buf_size);

@@ -265,8 +251,7 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
 }

 void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const {
-  hsaKmtUnmapMemoryToGPU(code_buf);
-  hsaKmtFreeMemory(code_buf, code_buf_size);
+  core::Runtime::runtime_singleton_->system_deallocator()(code_buf);
 }

 void GpuAgent::InitRegionList() {
@@ -415,7 +400,8 @@ bool GpuAgent::InitEndTsPool() {

  uint64_t* buff = NULL;
  if (HSA_STATUS_SUCCESS !=
-      runtime->AllocateMemory(true, local_region_, alloc_size,
+      runtime->AllocateMemory(local_region_, alloc_size,
+                              MemoryRegion::AllocateRestrict,
                              reinterpret_cast<void**>(&buff))) {
    return false;
  }
@@ -589,6 +575,10 @@ void GpuAgent::InitDma() {
 }

 hsa_status_t GpuAgent::PostToolsInit() {
+  // Defer memory allocation until agents have been discovered.
+  InitScratchPool();
+  BindTrapHandler();
+
  // Defer utility queue creation to allow tools to intercept.
  queues_[QueueUtility] = CreateInterceptibleQueue();

@@ -149,11 +149,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,

 MemoryRegion::~MemoryRegion() {}

-hsa_status_t MemoryRegion::Allocate(size_t size, void** address) const {
-  return Allocate(false, size, address);
-}
-
-hsa_status_t MemoryRegion::Allocate(bool restrict_access, size_t size,
+hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
                                    void** address) const {
  if (address == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -169,7 +165,13 @@ hsa_status_t MemoryRegion::Allocate(bool restrict_access, size_t size,

  size = AlignUp(size, kPageSize_);

-  *address = AllocateKfdMemory(mem_flag_, owner()->node_id(), size);
+  HsaMemFlags kmt_alloc_flags(mem_flag_);
+  kmt_alloc_flags.ui32.ExecuteAccess =
+      (alloc_flags & AllocateExecutable ? 1 : 0);
+  kmt_alloc_flags.ui32.AQLQueueMemory =
+      (alloc_flags & AllocateDoubleMap ? 1 : 0);
+
+  *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);

  if (*address != NULL) {
    // Commit the memory.
@@ -184,7 +186,7 @@ hsa_status_t MemoryRegion::Allocate(bool restrict_access, size_t size,
    const uint32_t* map_node_id = &owner_node_id;

    if (IsSystem()) {
-      if (!restrict_access) {
+      if ((alloc_flags & AllocateRestrict) == 0) {
        // Map to all GPU agents.
        map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();

@@ -903,8 +903,8 @@ hsa_status_t
  const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region);
  IS_VALID(mem_region);

-  return core::Runtime::runtime_singleton_->AllocateMemory(mem_region, size,
-                                                           ptr);
+  return core::Runtime::runtime_singleton_->AllocateMemory(
+      mem_region, size, core::MemoryRegion::AllocateNoFlags, ptr);
 }

 hsa_status_t hsa_memory_free(void* ptr) {
@@ -464,8 +464,8 @@ hsa_status_t
    return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL;
  }

-  return core::Runtime::runtime_singleton_->AllocateMemory(true, mem_region,
-                                                           size, ptr);
+  return core::Runtime::runtime_singleton_->AllocateMemory(
+      mem_region, size, core::MemoryRegion::AllocateRestrict, ptr);
 }

 hsa_status_t hsa_amd_memory_pool_free(void* ptr) {
@@ -159,29 +159,22 @@ void Runtime::RegisterAgent(Agent* agent) {
    // Init default fine grain system region allocator using fine grain
    // system region of the first discovered CPU agent.
    if (cpu_agents_.size() == 1) {
-      if (system_regions_fine_[0]->full_profile()) {
-        system_allocator_ = [](size_t size, size_t alignment) -> void * {
-          return _aligned_malloc(size, alignment);
-        };
+      // Might need memory pooling to cover allocations that
+      // require fewer than 4096 bytes.
+      system_allocator_ =
+          [&](size_t size, size_t alignment,
+              MemoryRegion::AllocateFlags alloc_flags) -> void* {
+            assert(alignment <= 4096);
+            void* ptr = NULL;
+            return (HSA_STATUS_SUCCESS ==
+                    core::Runtime::runtime_singleton_->AllocateMemory(
+                        system_regions_fine_[0], size, alloc_flags, &ptr))
+                       ? ptr
+                       : NULL;
+          };

-        system_deallocator_ = [](void* ptr) { _aligned_free(ptr); };
-      } else {
-        // Might need memory pooling to cover allocation that
-        // requires less than 4096 bytes.
-        system_allocator_ = [&](size_t size, size_t alignment) -> void * {
-          assert(alignment <= 4096);
-          void* ptr = NULL;
-          return (HSA_STATUS_SUCCESS ==
-                  core::Runtime::runtime_singleton_->AllocateMemory(
-                      system_regions_fine_[0], size, &ptr))
-                     ? ptr
-                     : NULL;
-        };
-
-        system_deallocator_ = [](void* ptr) {
-          core::Runtime::runtime_singleton_->FreeMemory(ptr);
-        };
-      }
+      system_deallocator_ =
+          [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };

      BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
    }
@@ -307,16 +300,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
 }

 hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
-                                     void** ptr) {
-  return AllocateMemory(false, region, size, ptr);
-}
-
-hsa_status_t Runtime::AllocateMemory(bool restrict_access,
-                                     const MemoryRegion* region, size_t size,
+                                     MemoryRegion::AllocateFlags alloc_flags,
                                     void** address) {
-  const amd::MemoryRegion* amd_region =
-      reinterpret_cast<const amd::MemoryRegion*>(region);
-  hsa_status_t status = amd_region->Allocate(restrict_access, size, address);
+  hsa_status_t status = region->Allocate(size, alloc_flags, address);

  // Track the allocation result so that it could be freed properly.
  if (status == HSA_STATUS_SUCCESS) {