Temporary: Set AllocateGTTAccess and node_id for MES

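Temporary change: when the agent uses MES, AqlQueue allocates its shared queue memory with the new AllocateGTTAccess flag (in addition to AllocateNonPaged) and passes the agent's node_id down through the shared-object allocator to MemoryRegion::Allocate. When AllocateGTTAccess is set, the KFD allocation is issued against the agent's node_id instead of the owning region's node_id.

Change-Id: I22385d11b17b76cfb44278fa0d8a09bc8721cea6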
2024-03-07 23:54:17 +00:00
Parent 9e8f185397
@@ -44,7 +44,7 @@

 namespace rocr {
 namespace core {
-std::function<void*(size_t, size_t, uint32_t)> BaseShared::allocate_ = nullptr;
+std::function<void*(size_t, size_t, uint32_t, int)> BaseShared::allocate_ = nullptr;
 std::function<void(void*)> BaseShared::free_ = nullptr;
 }   // namespace core
 }   // namespace rocr
@@ -58,14 +58,14 @@ namespace core {
 class BaseShared {
 public:
  static void SetAllocateAndFree(
-      const std::function<void*(size_t, size_t, uint32_t)>& allocate,
+      const std::function<void*(size_t, size_t, uint32_t, int)>& allocate,
      const std::function<void(void*)>& free) {
    allocate_ = allocate;
    free_ = free;
  }

 protected:
-  static std::function<void*(size_t, size_t, uint32_t)> allocate_;
+  static std::function<void*(size_t, size_t, uint32_t, int)> allocate_;
  static std::function<void(void*)> free_;
 };

@@ -73,7 +73,19 @@ class BaseShared {
 template <typename T> class PageAllocator : private BaseShared {
 public:
  __forceinline static T* alloc(int flags = 0) {
-    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags));
+    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, 0));
+    if (ret == nullptr) throw std::bad_alloc();
+
+    MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
+
+    new (ret) T;
+
+    throwGuard.Dismiss();
+    return ret;
+  }
+
+  __forceinline static T* alloc(int agent_node_id, int flags) {
+    T* ret = reinterpret_cast<T*>(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id));
    if (ret == nullptr) throw std::bad_alloc();

    MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); });
@@ -107,6 +119,16 @@ class Shared final : private BaseShared {
      shared_object_ = PageAllocator<T>::alloc(flags);
  }

+  explicit Shared(int agent_node_id, Allocator* pool = nullptr, int flags = 0) : pool_(pool) {
+    assert(allocate_ != nullptr && free_ != nullptr &&
+           "Shared object allocator is not set");
+
+    if (pool_)
+      shared_object_ = pool_->alloc();
+    else
+      shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
+  }
+
  ~Shared() {
    assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");

@@ -147,6 +169,12 @@ template <typename T> class Shared<T, PageAllocator<T>> final : private BaseShar
    shared_object_ = PageAllocator<T>::alloc(flags);
  }

+  Shared(int agent_node_id, int flags) {
+    assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set");
+
+    shared_object_ = PageAllocator<T>::alloc(agent_node_id, flags);
+  }
+
  ~Shared() {
    assert(allocate_ != nullptr && free_ != nullptr &&
           "Shared object allocator is not set");
@@ -183,7 +211,7 @@ template <typename T, size_t Align> class SharedArray final : private BaseShared
    static_assert((__alignof(T) <= Align) || (Align == 0), "Align is less than alignof(T)");

    shared_object_ =
-        reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0));
+        reinterpret_cast<T*>(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0, 0));
    if (shared_object_ == nullptr) throw std::bad_alloc();

    size_t i = 0;
@@ -100,7 +100,7 @@ class MemoryRegion : public core::MemoryRegion {

  ~MemoryRegion();

-  hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const;
+  hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id = 0) const;

  hsa_status_t Free(void* address, size_t size) const;

@@ -200,7 +200,7 @@ class MemoryRegion : public core::MemoryRegion {
                                             const core::Runtime::LinkInfo& link_info) const;

  // Operational body for Allocate.  Recursive.
-  hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const;
+  hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const;

  // Operational body for Free.  Recursive.
  hsa_status_t FreeImpl(void* address, size_t size) const;
@@ -99,11 +99,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
    AllocateAsan = (1 << 6),        // ASAN - First page of allocation remapped to system memory
    AllocatePinned = (1 << 7),      // Currently treating Pinned memory as NoSubstitute
    AllocateMemoryOnly = (1 << 8),  // Memory only handle from thunk, no virtual address
+    // Allocate system memory with GTT access. The agent_node_id passed to Allocate() must be
+    // the node_id of the device, even though the memory being allocated is system memory;
+    // when this flag is set the allocation is issued against that node_id.
+    AllocateGTTAccess = (1 << 9),
  };

  typedef uint32_t AllocateFlags;

-  virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const = 0;
+  virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const = 0;

  virtual hsa_status_t Free(void* address, size_t size) const = 0;

@@ -162,6 +162,7 @@ struct SharedQueue {
 class LocalQueue {
 public:
  LocalQueue(int mem_flags) : local_queue_(mem_flags) {}
+  LocalQueue(int agent_node_id, int mem_flags) : local_queue_(agent_node_id, mem_flags) {}
  SharedQueue* queue() const { return local_queue_.shared_object(); }

 private:
@@ -183,6 +184,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
    public_handle_ = Convert(this);
  }

+  Queue(int agent_node_id, int mem_flags = 0) : LocalQueue(agent_node_id, mem_flags), amd_queue_(queue()->amd_queue) {
+    queue()->core_queue = this;
+    public_handle_ = Convert(this);
+  }
+
  virtual ~Queue() {}

  virtual void Destroy() { delete this; }
@@ -198,7 +198,7 @@ class Runtime {
  /// @retval ::HSA_STATUS_SUCCESS If allocation is successful.
  hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size,
                              MemoryRegion::AllocateFlags alloc_flags,
-                              void** address);
+                              void** address, int agent_node_id = 0);

  /// @brief Free memory previously allocated with AllocateMemory.
  ///
@@ -419,7 +419,7 @@ class Runtime {

  amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; }

-  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)>&
+  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)>&
  system_allocator() {
    return system_allocator_;
  }
@@ -659,7 +659,7 @@ class Runtime {
  prefetch_map_t prefetch_map_;

  // Allocator using ::system_region_
-  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;
+  std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags, int agent_node_id)> system_allocator_;

  // Deallocator using ::system_region_
  std::function<void(void*)> system_deallocator_;
@@ -80,7 +80,7 @@ int AqlQueue::rtti_id_ = 0;

 AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch,
                   core::HsaEventCallback callback, void* err_data, bool is_kv)
-    : Queue(agent->isMES() ? MemoryRegion::AllocateNonPaged : 0),
+    : Queue(agent->node_id(), agent->isMES() ? (MemoryRegion::AllocateGTTAccess | MemoryRegion::AllocateNonPaged) : 0),
      LocalSignal(0, false),
      DoorbellSignal(signal()),
      ring_buf_(nullptr),
@@ -59,8 +59,7 @@ namespace AMD {
 size_t MemoryRegion::max_sysmem_alloc_size_ = 0;
 size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE);

-void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
-                                      HSAuint32 node_id, size_t size) {
+void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) {
  void* ret = NULL;
  const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
  return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
@@ -170,13 +169,13 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile,

 MemoryRegion::~MemoryRegion() {}

-hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const {
+hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const {
  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
-  return AllocateImpl(size, alloc_flags, address);
+  return AllocateImpl(size, alloc_flags, address, agent_node_id);
 }

 hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
-                                        void** address) const {
+                                        void** address, int agent_node_id) const {
  if (address == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
@@ -209,6 +208,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
  kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain);
  kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute);

+  kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess);
+
  // Only allow using the suballocator for ordinary VRAM.
  if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
    bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
@@ -228,12 +229,14 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
    }
  }

+  const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id();
+
  // Allocate memory.
  // If it fails attempt to release memory from the block allocator and retry.
-  *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
+  *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
  if (*address == nullptr) {
    owner()->Trim();
-    *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
+    *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
  }

  if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS;
@@ -768,7 +771,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated
  size_t bsize = AlignUp(request_size, block_size());

  hsa_status_t err = region_.AllocateImpl(
-      bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret);
+      bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret, 0);
  if (err != HSA_STATUS_SUCCESS)
    throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
  assert(ret != nullptr && "Region returned nullptr on success.");
@@ -208,12 +208,12 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) {
      for (auto pool : system_regions_fine_) {
        if (pool->kernarg()) {
          system_allocator_ = [pool](size_t size, size_t alignment,
-                                     MemoryRegion::AllocateFlags alloc_flags) -> void* {
+                                     MemoryRegion::AllocateFlags alloc_flags, int agent_node_id) -> void* {
            assert(alignment <= 4096);
            void* ptr = NULL;
            return (HSA_STATUS_SUCCESS ==
                    core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags,
-                                                                      &ptr))
+                                                                      &ptr, agent_node_id))
                ? ptr
                : NULL;
          };
@@ -311,9 +311,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,

 hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
                                     MemoryRegion::AllocateFlags alloc_flags,
-                                     void** address) {
+                                     void** address, int agent_node_id) {
  size_t size_requested = size;  // region->Allocate(...) may align-up size to granularity
-  hsa_status_t status = region->Allocate(size, alloc_flags, address);
+  hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id);
  // Track the allocation result so that it could be freed properly.
  if (status == HSA_STATUS_SUCCESS) {
    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
@@ -497,7 +497,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
  requires the caller to specify all allowed agents we can't assume that a peer mapped pointer
  would remain mapped for the duration of the copy.
  */
-  void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags);
+  void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags, 0);
  MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); });
  hsa_status_t err = src_agent->DmaCopy(temp, source, size);
  if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
@@ -3005,7 +3005,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz

  ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
  void* thunk_handle;
-  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle);
+  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0);
  if (status == HSA_STATUS_SUCCESS) {
    memory_handle_map_.emplace(std::piecewise_construct,
          std::forward_as_tuple(thunk_handle),
@@ -73,11 +73,11 @@ SharedSignal* SharedSignalPool_t::alloc() {
  ScopedAcquire<HybridMutex> lock(&lock_);
  if (free_list_.empty()) {
    SharedSignal* block = reinterpret_cast<SharedSignal*>(
-        allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
+        allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
    if (block == nullptr) {
      block_size_ = minblock_;
      block = reinterpret_cast<SharedSignal*>(
-          allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0));
+          allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0));
      if (block == nullptr) throw std::bad_alloc();
    }