Create fine-grained allocator

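Create allocator helper functions to provide fine-grained memory on a specific agent. The agent's local fine-grain region is now always created; it is hidden from region enumeration unless the agent is in an XGMI hive or fine-grain PCIe is enabled.

Change-Id: I32ba9aceb9c9dc708b140a0c45158e6e7a018844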
2023-08-23 20:50:36 +00:00
@@ -411,6 +411,13 @@ class GpuAgent : public GpuAgentInt {

  const std::function<void(void*)>& system_deallocator() const { return system_deallocator_; }

+  const std::function<void*(size_t size, core::MemoryRegion::AllocateFlags flags)>&
+  finegrain_allocator() const {
+    return finegrain_allocator_;
+  }
+
+  const std::function<void(void*)>& finegrain_deallocator() const { return finegrain_deallocator_; }
+
 protected:
  // Sizes are in packets.
  static const uint32_t minAqlSize_ = 0x40;     // 4KB min
@@ -581,8 +588,8 @@ class GpuAgent : public GpuAgentInt {
  // @brief Setup GWS accessing queue.
  void InitGWS();

-  // @brief Setup NUMA aware system memory allocator.
-  void InitNumaAllocator();
+  // @brief Set up the system (kernarg) and fine-grain memory allocators.
+  void InitAllocators();

  // @brief Initialize scratch handler thresholds
  void InitAsyncScratchThresholds();
@@ -657,6 +664,10 @@ class GpuAgent : public GpuAgentInt {

  std::function<void(void*)> system_deallocator_;

+  // @brief Fine-grain allocator backed by this device's local memory.
+  std::function<void*(size_t size, core::MemoryRegion::AllocateFlags flags)> finegrain_allocator_;
+
+  std::function<void(void*)> finegrain_deallocator_;
  // @brief device handle
  amdgpu_device_handle ldrm_dev_;

@@ -96,7 +96,7 @@ class MemoryRegion : public core::MemoryRegion {
  static void MakeKfdMemoryUnresident(const void* ptr);

  MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain,
-               core::Agent* owner, const HsaMemoryProperties& mem_props);
+               bool user_visible, core::Agent* owner, const HsaMemoryProperties& mem_props);

  ~MemoryRegion();

@@ -58,11 +58,12 @@ class Agent;
 class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
 public:
  MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain,
-               core::Agent* owner)
+               bool user_visible, core::Agent* owner)
      : fine_grain_(fine_grain),
        kernarg_(kernarg),
        full_profile_(full_profile),
        extended_scope_fine_grain_(extended_scope_fine_grain),
+        user_visible_(user_visible),
        owner_(owner) {
    assert(owner_ != NULL);
  }
@@ -132,6 +133,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {

  __forceinline bool full_profile() const { return full_profile_; }

+  __forceinline bool user_visible() const { return user_visible_; }
+
  __forceinline core::Agent* owner() const { return owner_; }

 private:
@@ -139,6 +142,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
  const bool kernarg_;
  const bool full_profile_;
  const bool extended_scope_fine_grain_;
+  const bool user_visible_;
+
  core::Agent* owner_;
 };
 }  // namespace core
@@ -85,15 +85,15 @@ void CpuAgent::InitRegionList() {
    if (system_prop != mem_props.end()) system_props = *system_prop;

    MemoryRegion* system_region_fine =
-        new MemoryRegion(true, false, is_apu_node, false, this, system_props);
+        new MemoryRegion(true, false, is_apu_node, false, true, this, system_props);
    regions_.push_back(system_region_fine);
    MemoryRegion* system_region_kernarg =
-        new MemoryRegion(true, true, is_apu_node, false, this, system_props);
+        new MemoryRegion(true, true, is_apu_node, false, true, this, system_props);
    regions_.push_back(system_region_kernarg);

    if (!is_apu_node) {
      MemoryRegion* system_region_coarse =
-          new MemoryRegion(false, false, is_apu_node, false, this, system_props);
+          new MemoryRegion(false, false, is_apu_node, false, true, this, system_props);
      regions_.push_back(system_region_coarse);
    }
  }
@@ -152,6 +152,7 @@ hsa_status_t CpuAgent::VisitRegion(
    hsa_status_t (*callback)(hsa_region_t region, void* data),
    void* data) const {
  for (const core::MemoryRegion* region : regions) {
+    if (!region->user_visible()) continue;
    hsa_region_t region_handle = core::MemoryRegion::Convert(region);
    hsa_status_t status = callback(region_handle, data);
    if (status != HSA_STATUS_SUCCESS) {
@@ -448,19 +448,20 @@ void GpuAgent::InitRegionList() {
        case HSA_HEAPTYPE_GPU_LDS:
        case HSA_HEAPTYPE_GPU_SCRATCH: {
          MemoryRegion* region =
-              new MemoryRegion(false, false, false, false, this, mem_props[mem_idx]);
+              new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]);

          regions_.push_back(region);

          if (region->IsLocalMemory()) {
            regions_.push_back(
-                new MemoryRegion(false, false, false, true, this, mem_props[mem_idx]));
+                new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx]));
+
            // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI.
-            if ((properties_.HiveID != 0) ||
-                (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) {
-              regions_.push_back(
-                  new MemoryRegion(true, false, false, false, this, mem_props[mem_idx]));
-            }
+            bool user_visible = (properties_.HiveID != 0) ||
+                core::Runtime::runtime_singleton_->flag().fine_grain_pcie();
+
+            regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this,
+                                                mem_props[mem_idx]));
          }
          break;
        }
@@ -650,6 +651,8 @@ hsa_status_t GpuAgent::VisitRegion(
    void* data) const {
  AMD::callback_t<decltype(callback)> call(callback);
  for (const core::MemoryRegion* region : regions) {
+    if (!region->user_visible()) continue;
+
    const AMD::MemoryRegion* amd_region =
        reinterpret_cast<const AMD::MemoryRegion*>(region);

@@ -850,7 +853,7 @@ void GpuAgent::PreloadBlits() {

 hsa_status_t GpuAgent::PostToolsInit() {
  // Defer memory allocation until agents have been discovered.
-  InitNumaAllocator();
+  InitAllocators();
  InitScratchPool();
  BindTrapHandler();
  InitDma();
@@ -2241,7 +2244,7 @@ void GpuAgent::Trim() {
  scratch_cache_.trim(false);
 }

-void GpuAgent::InitNumaAllocator() {
+void GpuAgent::InitAllocators() {
  for (auto pool : GetNearestCpuAgent()->regions()) {
    if (pool->kernarg()) {
      system_allocator_ = [pool](size_t size, size_t alignment,
@@ -2255,11 +2258,29 @@ void GpuAgent::InitNumaAllocator() {
      };

      system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
-
-      return;
    }
  }
-  assert(false && "Nearest NUMA node did not have a kernarg pool.");
+  assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool.");
+
+  // Set up the fine-grain allocator from this agent's local fine-grain region.
+  for (auto region : regions()) {
+    const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region;
+    if (amd_region->IsLocalMemory() && amd_region->fine_grain()) {
+      finegrain_allocator_ = [region](size_t size,
+                                      MemoryRegion::AllocateFlags alloc_flags) -> void* {
+        void* ptr = nullptr;
+        return (HSA_STATUS_SUCCESS ==
+                core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr))
+            ? ptr
+            : nullptr;
+      };
+
+      finegrain_deallocator_ = [](void* ptr) {
+        core::Runtime::runtime_singleton_->FreeMemory(ptr);
+      };
+    }
+  }
+  assert(finegrain_deallocator_ && "Agent does not have a fine-grain allocator");
 }

 core::Agent* GpuAgent::GetNearestCpuAgent() const {
@@ -102,9 +102,10 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
 }

 MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,
-                           bool extended_scope_fine_grain, core::Agent* owner,
+                           bool extended_scope_fine_grain, bool user_visible, core::Agent* owner,
                           const HsaMemoryProperties& mem_props)
-    : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, owner),
+    : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, user_visible,
+                         owner),
      mem_props_(mem_props),
      max_single_alloc_size_(0),
      virtual_size_(0),