diff --git a/runtime/hsa-runtime/core/common/shared.cpp b/runtime/hsa-runtime/core/common/shared.cpp index 2adfda93bd..19ae5b2632 100644 --- a/runtime/hsa-runtime/core/common/shared.cpp +++ b/runtime/hsa-runtime/core/common/shared.cpp @@ -44,7 +44,7 @@ namespace rocr { namespace core { -std::function BaseShared::allocate_ = nullptr; +std::function BaseShared::allocate_ = nullptr; std::function BaseShared::free_ = nullptr; } // namespace core } // namespace rocr diff --git a/runtime/hsa-runtime/core/common/shared.h b/runtime/hsa-runtime/core/common/shared.h index 47000d4d5d..592295c529 100644 --- a/runtime/hsa-runtime/core/common/shared.h +++ b/runtime/hsa-runtime/core/common/shared.h @@ -58,14 +58,14 @@ namespace core { class BaseShared { public: static void SetAllocateAndFree( - const std::function& allocate, + const std::function& allocate, const std::function& free) { allocate_ = allocate; free_ = free; } protected: - static std::function allocate_; + static std::function allocate_; static std::function free_; }; @@ -73,7 +73,19 @@ class BaseShared { template class PageAllocator : private BaseShared { public: __forceinline static T* alloc(int flags = 0) { - T* ret = reinterpret_cast(allocate_(AlignUp(sizeof(T), 4096), 4096, flags)); + T* ret = reinterpret_cast(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, 0)); + if (ret == nullptr) throw std::bad_alloc(); + + MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); }); + + new (ret) T; + + throwGuard.Dismiss(); + return ret; + } + + __forceinline static T* alloc(int agent_node_id, int flags) { + T* ret = reinterpret_cast(allocate_(AlignUp(sizeof(T), 4096), 4096, flags, agent_node_id)); if (ret == nullptr) throw std::bad_alloc(); MAKE_NAMED_SCOPE_GUARD(throwGuard, [&]() { free_(ret); }); @@ -107,6 +119,16 @@ class Shared final : private BaseShared { shared_object_ = PageAllocator::alloc(flags); } + explicit Shared(int agent_node_id, Allocator* pool = nullptr, int flags = 0) : pool_(pool) { + assert(allocate_ != nullptr && free_ != nullptr && + "Shared object allocator is not set"); + + if (pool_) + shared_object_ = pool_->alloc(); + else + shared_object_ = PageAllocator::alloc(agent_node_id, flags); + } + ~Shared() { assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); @@ -147,6 +169,12 @@ template class Shared> final : private BaseShar shared_object_ = PageAllocator::alloc(flags); } + Shared(int agent_node_id, int flags) { + assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); + + shared_object_ = PageAllocator::alloc(agent_node_id, flags); + } + ~Shared() { assert(allocate_ != nullptr && free_ != nullptr && "Shared object allocator is not set"); @@ -183,7 +211,7 @@ template class SharedArray final : private BaseShared static_assert((__alignof(T) <= Align) || (Align == 0), "Align is less than alignof(T)"); shared_object_ = - reinterpret_cast(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0)); + reinterpret_cast(allocate_(sizeof(T) * length, Max(__alignof(T), Align), 0, 0)); if (shared_object_ == nullptr) throw std::bad_alloc(); size_t i = 0; diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h index 51df2c4f81..0d774dece0 100644 --- a/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -100,7 +100,7 @@ class MemoryRegion : public core::MemoryRegion { ~MemoryRegion(); - hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const; + hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id = 0) const; hsa_status_t Free(void* address, size_t size) const; @@ -200,7 +200,7 @@ class MemoryRegion : public core::MemoryRegion { const core::Runtime::LinkInfo& link_info) const; // Operational body for Allocate. Recursive. - hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const; + hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const; // Operational body for Free. Recursive. hsa_status_t FreeImpl(void* address, size_t size) const; diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h index 0357972cdf..f36b195e7e 100644 --- a/runtime/hsa-runtime/core/inc/memory_region.h +++ b/runtime/hsa-runtime/core/inc/memory_region.h @@ -99,11 +99,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { AllocateAsan = (1 << 6), // ASAN - First page of allocation remapped to system memory AllocatePinned = (1 << 7), // Currently treating Pinned memory as NoSubstitute AllocateMemoryOnly = (1 << 8), // Memory only handle from thunk, no virtual address + // Flag to allocate system memory with GTT Access + // Note: The node_id needs to be the node_id of the device even though this is allocating + // system memory + AllocateGTTAccess = (1 << 9), }; typedef uint32_t AllocateFlags; - virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const = 0; + virtual hsa_status_t Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const = 0; virtual hsa_status_t Free(void* address, size_t size) const = 0; diff --git a/runtime/hsa-runtime/core/inc/queue.h b/runtime/hsa-runtime/core/inc/queue.h index 468c073282..1125102344 100644 --- a/runtime/hsa-runtime/core/inc/queue.h +++ b/runtime/hsa-runtime/core/inc/queue.h @@ -162,6 +162,7 @@ struct SharedQueue { class LocalQueue { public: LocalQueue(int mem_flags) : local_queue_(mem_flags) {} + LocalQueue(int agent_node_id, int mem_flags) : local_queue_(agent_node_id, mem_flags) {} SharedQueue* queue() const { return local_queue_.shared_object(); } private: @@ -183,6 +184,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { public_handle_ = Convert(this); } + Queue(int agent_node_id, int mem_flags = 0) : LocalQueue(agent_node_id, mem_flags), amd_queue_(queue()->amd_queue) { + queue()->core_queue = this; + public_handle_ = Convert(this); + } + virtual ~Queue() {} virtual void Destroy() { delete this; } diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index 257e31bbe1..6f5997f7e4 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -198,7 +198,7 @@ class Runtime { /// @retval ::HSA_STATUS_SUCCESS If allocation is successful. hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size, MemoryRegion::AllocateFlags alloc_flags, - void** address); + void** address, int agent_node_id = 0); /// @brief Free memory previously allocated with AllocateMemory. /// @@ -419,7 +419,7 @@ class Runtime { amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; } - std::function& + std::function& system_allocator() { return system_allocator_; } @@ -659,7 +659,7 @@ class Runtime { prefetch_map_t prefetch_map_; // Allocator using ::system_region_ - std::function system_allocator_; + std::function system_allocator_; // Deallocator using ::system_region_ std::function system_deallocator_; diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 21f0b7d926..fbcae5f6e9 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -80,7 +80,7 @@ int AqlQueue::rtti_id_ = 0; AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, ScratchInfo& scratch, core::HsaEventCallback callback, void* err_data, bool is_kv) - : Queue(agent->isMES() ? MemoryRegion::AllocateNonPaged : 0), + : Queue(agent->node_id(), agent->isMES() ? (MemoryRegion::AllocateGTTAccess | MemoryRegion::AllocateNonPaged) : 0), LocalSignal(0, false), DoorbellSignal(signal()), ring_buf_(nullptr), diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index d9a16f19d3..f28434d80e 100644 --- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -59,8 +59,7 @@ namespace AMD { size_t MemoryRegion::max_sysmem_alloc_size_ = 0; size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE); -void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, - HSAuint32 node_id, size_t size) { +void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) { void* ret = NULL; const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret); return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL; @@ -170,13 +169,13 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, MemoryRegion::~MemoryRegion() {} -hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const { +hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address, int agent_node_id) const { ScopedAcquire lock(&owner()->agent_memory_lock_); - return AllocateImpl(size, alloc_flags, address); + return AllocateImpl(size, alloc_flags, address, agent_node_id); } hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, - void** address) const { + void** address, int agent_node_id) const { if (address == NULL) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -209,6 +208,8 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, kmt_alloc_flags.ui32.CoarseGrain = (alloc_flags & AllocatePCIeRW ? 0 : kmt_alloc_flags.ui32.CoarseGrain); kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute); + kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess); + // Only allow using the suballocator for ordinary VRAM. if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) { bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc(); @@ -228,12 +229,14 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, } } + const HSAuint32 node_id = (alloc_flags & AllocateGTTAccess) ? agent_node_id : owner()->node_id(); + // Allocate memory. // If it fails attempt to release memory from the block allocator and retry. - *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); + *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size); if (*address == nullptr) { owner()->Trim(); - *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); + *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size); } if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS; @@ -768,7 +771,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated size_t bsize = AlignUp(request_size, block_size()); hsa_status_t err = region_.AllocateImpl( - bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret); + bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret, 0); if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed."); assert(ret != nullptr && "Region returned nullptr on success."); diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index f674c7ff49..ec37c154a8 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -208,12 +208,12 @@ void Runtime::RegisterAgent(Agent* agent, bool Enabled) { for (auto pool : system_regions_fine_) { if (pool->kernarg()) { system_allocator_ = [pool](size_t size, size_t alignment, - MemoryRegion::AllocateFlags alloc_flags) -> void* { + MemoryRegion::AllocateFlags alloc_flags, int agent_node_id) -> void* { assert(alignment <= 4096); void* ptr = NULL; return (HSA_STATUS_SUCCESS == core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, - &ptr)) + &ptr, agent_node_id)) ? ptr : NULL; }; @@ -311,9 +311,9 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent, hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, MemoryRegion::AllocateFlags alloc_flags, - void** address) { + void** address, int agent_node_id) { size_t size_requested = size; // region->Allocate(...) may align-up size to granularity - hsa_status_t status = region->Allocate(size, alloc_flags, address); + hsa_status_t status = region->Allocate(size, alloc_flags, address, agent_node_id); // Track the allocation result so that it could be freed properly. if (status == HSA_STATUS_SUCCESS) { ScopedAcquire lock(&memory_lock_); @@ -497,7 +497,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { requires the caller to specify all allowed agents we can't assume that a peer mapped pointer would remain mapped for the duration of the copy. */ - void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags); + void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags, 0); MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); }); hsa_status_t err = src_agent->DmaCopy(temp, source, size); if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size); @@ -3005,7 +3005,7 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz ScopedAcquire lock(&memory_lock_); void* thunk_handle; - hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle); + hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0); if (status == HSA_STATUS_SUCCESS) { memory_handle_map_.emplace(std::piecewise_construct, std::forward_as_tuple(thunk_handle), diff --git a/runtime/hsa-runtime/core/runtime/signal.cpp b/runtime/hsa-runtime/core/runtime/signal.cpp index 1ad41e2ff6..eee62f5951 100644 --- a/runtime/hsa-runtime/core/runtime/signal.cpp +++ b/runtime/hsa-runtime/core/runtime/signal.cpp @@ -73,11 +73,11 @@ SharedSignal* SharedSignalPool_t::alloc() { ScopedAcquire lock(&lock_); if (free_list_.empty()) { SharedSignal* block = reinterpret_cast( - allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0)); + allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0)); if (block == nullptr) { block_size_ = minblock_; block = reinterpret_cast( - allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0)); + allocate_(block_size_ * sizeof(SharedSignal), __alignof(SharedSignal), 0, 0)); if (block == nullptr) throw std::bad_alloc(); }