From 117be0b55a6e2bb3fa5a71dce5e96514cf1beeba Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Mon, 11 Sep 2017 20:31:20 -0500 Subject: [PATCH] Add suballocator for ordinary VRAM allocations smaller than 2MB. Track pointer info for sub 2MB fragment allocations in allocation_map_. Add fragment support to IPC. Change-Id: I00cfc2e2fa289aac90a4718c392f9bb056a61a87 --- .../hsa-runtime/core/inc/amd_memory_region.h | 15 ++ runtime/hsa-runtime/core/inc/memory_region.h | 1 + runtime/hsa-runtime/core/inc/runtime.h | 13 +- .../core/runtime/amd_memory_region.cpp | 33 ++- runtime/hsa-runtime/core/runtime/runtime.cpp | 175 ++++++++++--- runtime/hsa-runtime/core/util/locks.h | 9 +- runtime/hsa-runtime/core/util/simple_heap.h | 247 ++++++++++++++++++ runtime/hsa-runtime/inc/hsa_ext_amd.h | 2 +- 8 files changed, 448 insertions(+), 47 deletions(-) create mode 100644 runtime/hsa-runtime/core/util/simple_heap.h diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h index d2321dfd4e..f411c05c2f 100644 --- a/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -49,6 +49,7 @@ #include "core/inc/agent.h" #include "core/inc/memory_region.h" +#include "core/util/simple_heap.h" #include "inc/hsa_ext_amd.h" @@ -181,7 +182,21 @@ class MemoryRegion : public core::MemoryRegion { HSAuint64 virtual_size_; static const size_t kPageSize_ = 4096; + + class BlockAllocator { + private: + MemoryRegion& region_; + static const size_t block_size_ = 2 * 1024 * 1024; // 2MB blocks. 
+ public: + explicit BlockAllocator(MemoryRegion& region) : region_(region) {} + void* alloc(size_t request_size, size_t& allocated_size) const; + void free(void* ptr, size_t length) const { region_.Free(ptr, length); } + size_t block_size() const { return block_size_; } + }; + + mutable SimpleHeap fragment_allocator_; }; + } // namespace #endif // header guard diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h index 502ebb38d4..bea4250086 100644 --- a/runtime/hsa-runtime/core/inc/memory_region.h +++ b/runtime/hsa-runtime/core/inc/memory_region.h @@ -85,6 +85,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { AllocateRestrict = (1 << 0), // Don't map system memory to GPU agents AllocateExecutable = (1 << 1), // Set executable permission AllocateDoubleMap = (1 << 2), // Map twice VA allocation to backing store + AllocateDirect = (1 << 3), // Bypass fragment cache. }; typedef uint32_t AllocateFlags; diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index 06eff531ab..1a18149db3 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -264,8 +264,14 @@ class Runtime { hsa_status_t InteropUnmap(void* ptr); + struct PtrInfoBlockData { + void* base; + size_t length; + }; + hsa_status_t PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), - uint32_t* num_agents_accessible, hsa_agent_t** accessible); + uint32_t* num_agents_accessible, hsa_agent_t** accessible, + PtrInfoBlockData* block_info = nullptr); hsa_status_t SetPtrInfoData(void* ptr, void* userptr); @@ -315,12 +321,13 @@ class Runtime { static void AsyncEventsLoop(void*); struct AllocationRegion { - AllocationRegion() : region(NULL), size(0) {} + AllocationRegion() : region(NULL), size(0), user_ptr(nullptr) {} AllocationRegion(const MemoryRegion* region_arg, size_t size_arg) - : region(region_arg), size(size_arg) {} + : region(region_arg), 
size(size_arg), user_ptr(nullptr) {} const MemoryRegion* region; size_t size; + void* user_ptr; }; struct AsyncEventsControl { diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index 00642f3de0..1e44e14236 100644 --- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -49,6 +49,7 @@ #include "core/inc/amd_cpu_agent.h" #include "core/inc/amd_gpu_agent.h" #include "core/util/utils.h" +#include "core/inc/exceptions.h" namespace amd { void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, @@ -98,13 +99,13 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { hsaKmtUnmapMemoryToGPU(const_cast(ptr)); } -MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, - core::Agent* owner, +MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props) : core::MemoryRegion(fine_grain, full_profile, owner), mem_props_(mem_props), max_single_alloc_size_(0), - virtual_size_(0) { + virtual_size_(0), + fragment_allocator_(BlockAllocator(*this)) { virtual_size_ = GetPhysicalSize(); mem_flag_.Value = 0; @@ -169,6 +170,15 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags, kmt_alloc_flags.ui32.AQLQueueMemory = (alloc_flags & AllocateDoubleMap ? 1 : 0); + // Only allow using the suballocator for ordinary VRAM. 
+ bool useSubAlloc = IsLocalMemory(); + useSubAlloc &= (alloc_flags == AllocateRestrict); + useSubAlloc &= (size <= fragment_allocator_.max_alloc()); + if (useSubAlloc) { + *address = fragment_allocator_.alloc(size); + return HSA_STATUS_SUCCESS; + } + *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); if (*address != NULL) { @@ -220,6 +230,8 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags, } hsa_status_t MemoryRegion::Free(void* address, size_t size) const { + if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS; + MakeKfdMemoryUnresident(address); FreeKfdMemory(address, size); @@ -586,4 +598,19 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size, return HSA_STATUS_SUCCESS; } +void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const { + assert(request_size < block_size() && "BlockAllocator alloc request exceeds block size."); + + void* ret; + hsa_status_t err = region_.Allocate( + block_size(), core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, + &ret); + if (err != HSA_STATUS_SUCCESS) + throw new ::AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed."); + assert(ret != nullptr && "Region returned nullptr on success."); + + allocated_size = block_size(); + return ret; +} + } // namespace diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index 1913e179e7..9f3bebca37 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -321,11 +321,11 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, } hsa_status_t Runtime::FreeMemory(void* ptr) { - if (ptr == NULL) { + if (ptr == nullptr) { return HSA_STATUS_SUCCESS; } - const MemoryRegion* region = NULL; + const MemoryRegion* region = nullptr; size_t size = 0; ScopedAcquire lock(&memory_lock_); @@ -333,11 +333,14 @@ hsa_status_t 
Runtime::FreeMemory(void* ptr) { if (it == allocation_map_.end()) { assert(false && "Can't find address in allocation map"); - return HSA_STATUS_ERROR; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } region = it->second.region; size = it->second.size; + // Imported fragments can't be released with FreeMemory. + if (region == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + allocation_map_.erase(it); return region->Free(ptr, size); @@ -681,7 +684,17 @@ hsa_status_t Runtime::InteropUnmap(void* ptr) { } hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), - uint32_t* num_agents_accessible, hsa_agent_t** accessible) { + uint32_t* num_agents_accessible, hsa_agent_t** accessible, + PtrInfoBlockData* block_info) { + static_assert(static_cast(HSA_POINTER_UNKNOWN) == static_cast(HSA_EXT_POINTER_TYPE_UNKNOWN), + "Thunk pointer info mismatch"); + static_assert(static_cast(HSA_POINTER_ALLOCATED) == static_cast(HSA_EXT_POINTER_TYPE_HSA), + "Thunk pointer info mismatch"); + static_assert(static_cast(HSA_POINTER_REGISTERED_USER) == static_cast(HSA_EXT_POINTER_TYPE_LOCKED), + "Thunk pointer info mismatch"); + static_assert(static_cast(HSA_POINTER_REGISTERED_GRAPHICS) == static_cast(HSA_EXT_POINTER_TYPE_GRAPHICS), + "Thunk pointer info mismatch"); + HsaPointerInfo thunkInfo; uint32_t* mappedNodes; @@ -692,36 +705,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a bool returnListData = ((alloc != nullptr) && (num_agents_accessible != nullptr) && (accessible != nullptr)); - if (returnListData) { - size_t max_agents = cpu_agents_.size() + gpu_agents_.size(); - mappedNodes = (uint32_t*)alloca(max_agents * sizeof(uint32_t)); - // memory_lock protects access to the NMappedNodes array since this changes with calls to memory - // APIs. + + { // memory_lock protects access to the NMappedNodes array and fragment user data since these may + // change with calls to memory APIs. 
ScopedAcquire lock(&memory_lock_); hsaKmtQueryPointerInfo(ptr, &thunkInfo); - assert(thunkInfo.NMappedNodes <= max_agents && - "PointerInfo: Thunk returned more than all agents in NMappedNodes."); - memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t)); - } else { - hsaKmtQueryPointerInfo(ptr, &thunkInfo); - } - - static_assert((int)HSA_POINTER_UNKNOWN == (int)HSA_EXT_POINTER_TYPE_UNKNOWN, - "Thunk pointer info mismatch"); - static_assert((int)HSA_POINTER_ALLOCATED == (int)HSA_EXT_POINTER_TYPE_HSA, - "Thunk pointer info mismatch"); - static_assert((int)HSA_POINTER_REGISTERED_USER == (int)HSA_EXT_POINTER_TYPE_LOCKED, - "Thunk pointer info mismatch"); - static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS, - "Thunk pointer info mismatch"); + if (returnListData) { + assert(thunkInfo.NMappedNodes <= agents_by_node_.size() && + "PointerInfo: Thunk returned more than all agents in NMappedNodes."); + mappedNodes = (uint32_t*)alloca(thunkInfo.NMappedNodes * sizeof(uint32_t)); + memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t)); + } + retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type; + retInfo.agentBaseAddress = reinterpret_cast(thunkInfo.GPUAddress); + retInfo.hostBaseAddress = thunkInfo.CPUAddress; + retInfo.sizeInBytes = thunkInfo.SizeInBytes; + retInfo.userData = thunkInfo.UserData; + if (block_info != nullptr) { + block_info->base = retInfo.hostBaseAddress; + block_info->length = retInfo.sizeInBytes; + } + if (retInfo.type == HSA_EXT_POINTER_TYPE_HSA) { + auto fragment = allocation_map_.upper_bound(ptr); + if (fragment != allocation_map_.begin()) { + fragment--; + if ((fragment->first <= ptr) && + (ptr < reinterpret_cast(fragment->first) + fragment->second.size)) { + retInfo.hostBaseAddress = const_cast(fragment->first); + retInfo.agentBaseAddress = retInfo.hostBaseAddress; + retInfo.sizeInBytes = fragment->second.size; + retInfo.userData = 
fragment->second.user_ptr; + } + } + } + } // end lock scope retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t)); - retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type; - retInfo.agentBaseAddress = reinterpret_cast(thunkInfo.GPUAddress); - retInfo.hostBaseAddress = thunkInfo.CPUAddress; - retInfo.sizeInBytes = thunkInfo.SizeInBytes; - retInfo.userData = thunkInfo.UserData; - retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle(); + + // Temp: workaround thunk bug, IPC memory has garbage in Node. + // retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle(); + auto it = agents_by_node_.find(thunkInfo.Node); + if (it != agents_by_node_.end()) + retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle(); + else + retInfo.agentOwner.handle = 0; memcpy(info, &retInfo, retInfo.size); @@ -751,19 +778,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a } hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) { + { // Use allocation map if possible to handle fragments. + ScopedAcquire lock(&memory_lock_); + const auto& it = allocation_map_.find(ptr); + if (it != allocation_map_.end()) { + it->second.user_ptr = userptr; + return HSA_STATUS_SUCCESS; + } + } + // Cover entries not in the allocation map (graphics, lock,...) if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS) return HSA_STATUS_SUCCESS; - else - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) { static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle), "Thunk IPC mismatch."); - if (hsaKmtShareMemory(ptr, len, (HsaSharedMemoryHandle*)handle) == HSAKMT_STATUS_SUCCESS) - return HSA_STATUS_SUCCESS; - else + // Reject sharing allocations larger than ~8TB due to thunk limitations. 
+ if (len > 0x7FFFFFFF000ull) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + // Check for fragment sharing. + PtrInfoBlockData block; + hsa_amd_pointer_info_t info; + info.size = sizeof(info); + if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if ((block.base != ptr) || (block.length != len)) { + if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) { + assert(false && "Fragment's block not aligned to 2MB!"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + if (hsaKmtShareMemory(block.base, block.length, reinterpret_cast( + handle)) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + uint32_t offset = + (reinterpret_cast(ptr) - reinterpret_cast(block.base)) / 4096; + // Holds size in (4K?) pages in thunk handle: Mark as a fragment and denote offset. + handle->handle[6] |= 0x80000000 | offset; + } else { + if (hsaKmtShareMemory(ptr, len, reinterpret_cast(handle)) != + HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; } hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents, @@ -772,14 +830,36 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, void* importAddress; HSAuint64 importSize; HSAuint64 altAddress; + + hsa_amd_ipc_memory_t importHandle; + importHandle = *handle; + + // Extract fragment info + bool isFragment = false; + uint32_t fragOffset = 0; + auto fixFragment = [&]() { + if (!isFragment) return; + importAddress = reinterpret_cast(importAddress) + fragOffset; + len = Min(len, importSize - fragOffset); + ScopedAcquire lock(&memory_lock_); + allocation_map_[importAddress] = AllocationRegion(nullptr, len); + }; + + if ((importHandle.handle[6] & 0x80000000) != 0) { + isFragment = true; + fragOffset = (importHandle.handle[6] & 0x1FF) * 4096; + importHandle.handle[6] &= ~(0x80000000 | 0x1FF); + } + if (num_agents == 0) { - if 
(hsaKmtRegisterSharedHandle(reinterpret_cast(handle), + if (hsaKmtRegisterSharedHandle(reinterpret_cast(&importHandle), &importAddress, &importSize) != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT; if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) != HSAKMT_STATUS_SUCCESS) { hsaKmtDeregisterMemory(importAddress); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } + fixFragment(); *mapped_ptr = importAddress; return HSA_STATUS_SUCCESS; } @@ -798,9 +878,9 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, for (int i = 0; i < num_agents; i++) agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &nodes[i]); - if (hsaKmtRegisterSharedHandleToNodes(reinterpret_cast(handle), - &importAddress, &importSize, num_agents, - nodes) != HSAKMT_STATUS_SUCCESS) + if (hsaKmtRegisterSharedHandleToNodes( + reinterpret_cast(&importHandle), &importAddress, + &importSize, num_agents, nodes) != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT; HsaMemMapFlags map_flags; @@ -816,11 +896,28 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, } } + fixFragment(); *mapped_ptr = importAddress; return HSA_STATUS_SUCCESS; } hsa_status_t Runtime::IPCDetach(void* ptr) { + { // Handle imported fragments. + ScopedAcquire lock(&memory_lock_); + const auto& it = allocation_map_.find(ptr); + if (it != allocation_map_.end()) { + if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + allocation_map_.erase(it); + lock.Release(); // Can't hold memory lock when using pointer info. 
+ + PtrInfoBlockData block; + hsa_amd_pointer_info_t info; + info.size = sizeof(info); + if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + ptr = block.base; + } + } if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT; if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS) diff --git a/runtime/hsa-runtime/core/util/locks.h b/runtime/hsa-runtime/core/util/locks.h index bab52f91e0..c9ff9ee7b2 100644 --- a/runtime/hsa-runtime/core/util/locks.h +++ b/runtime/hsa-runtime/core/util/locks.h @@ -59,7 +59,14 @@ class ScopedAcquire { explicit ScopedAcquire(LockType* lock) : lock_(lock) { lock_->Acquire(); } /// @brief: when destructing, release the lock. - ~ScopedAcquire() { lock_->Release(); } + ~ScopedAcquire() { + if (lock_ != nullptr) lock_->Release(); + } + + void Release() { + lock_->Release(); + lock_ = nullptr; + } private: LockType* lock_; diff --git a/runtime/hsa-runtime/core/util/simple_heap.h b/runtime/hsa-runtime/core/util/simple_heap.h new file mode 100644 index 0000000000..76df7ad73d --- /dev/null +++ b/runtime/hsa-runtime/core/util/simple_heap.h @@ -0,0 +1,247 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// A simple best-fit memory allocator that eagerly coalesces adjacent free fragments. +// Manages block sub-allocation. For use when memory efficiency is more important than allocation speed. +// O(log n) time. 
+ +#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ +#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ + +#include +#include +#include + +#include "core/util/utils.h" + +template class SimpleHeap { + private: + struct Fragment_T { + typedef std::multimap::iterator ptr_t; + ptr_t free_list_entry_; + size_t size; + + Fragment_T(ptr_t Iterator, size_t Len) : free_list_entry_(Iterator), size(Len) {} + Fragment_T() = default; + }; + + struct Block { + uintptr_t base_ptr_; + size_t length_; + + Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {} + Block() = default; + }; + + Allocator block_allocator_; + + std::multimap free_list_; + std::map> block_list_; + std::deque block_cache_; + + size_t in_use_size_; + size_t cache_size_; + + __forceinline bool isFree(const Fragment_T& node) { + return node.free_list_entry_ != free_list_.end(); + } + __forceinline void setUsed(Fragment_T& node) { node.free_list_entry_ = free_list_.end(); } + __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) { + node.free_list_entry_ = Iterator; + } + __forceinline Fragment_T makeFragment(size_t Len) { return Fragment_T(free_list_.end(), Len); } + __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) { + return Fragment_T(Iterator, Len); + } + + public: + explicit SimpleHeap(const Allocator& BlockAllocator = Allocator()) + : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {} + ~SimpleHeap() { + trim(); + // Leak here may be due to the user. Check is for debugging only. 
+ // assert(in_use_size_ == 0 && "Leak in SimpleHeap."); + } + + SimpleHeap(const SimpleHeap& rhs) = delete; + SimpleHeap(SimpleHeap&& rhs) = delete; + SimpleHeap& operator=(const SimpleHeap& rhs) = delete; + SimpleHeap& operator=(SimpleHeap&& rhs) = delete; + + void* alloc(size_t bytes) { + if (bytes > max_alloc()) { + assert(false && "Requested allocation is larger than block size."); + throw std::bad_alloc(); + return nullptr; + } + + // Find best fit. + auto free_fragment = free_list_.lower_bound(bytes); + uintptr_t base; + size_t size; + + if (free_fragment != free_list_.end()) { + base = free_fragment->second; + size = free_fragment->first; + free_list_.erase(free_fragment); + + assert(size >= bytes && "SimpleHeap: map lower_bound failure."); + + // Find the containing block and fragment + auto it = block_list_.upper_bound(base); + it--; + auto& frag_map = it->second; + const auto& fragment = frag_map.find(base); + + assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap."); + assert(size == fragment->second.size && "Inconsistency in SimpleHeap."); + + // Sub-allocate from fragment. + fragment->second.size = bytes; + setUsed(fragment->second); + // Record remaining free space. + if (size > bytes) { + free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes)); + frag_map[base + bytes] = makeFragment(free_fragment, size - bytes); + } + return reinterpret_cast(base); + } + + // No usable fragment, check block cache + if (!block_cache_.empty()) { + const auto& block = block_cache_.back(); + base = block.base_ptr_; + size = block.length_; + block_cache_.pop_back(); + cache_size_ -= size; + } else { // Alloc new block + void* ptr = block_allocator_.alloc(bytes, size); + base = reinterpret_cast(ptr); + assert(ptr != nullptr && "Block allocation failed, Allocator is expected to throw."); + } + + in_use_size_ += size; + assert(size >= bytes && "Alloc exceeds block size."); + // Sub alloc and insert free region. 
+ if (size > bytes) { + free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes)); + block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes); + } + // Track used region + block_list_[base][base] = makeFragment(bytes); + + return reinterpret_cast(base); + } + + bool free(void* ptr) { + if (ptr == nullptr) return true; + + uintptr_t base = reinterpret_cast(ptr); + + // Find fragment and validate. + auto frag_map_it = block_list_.upper_bound(base); + if (frag_map_it == block_list_.begin()) return false; + frag_map_it--; + auto& frag_map = frag_map_it->second; + auto fragment = frag_map.find(base); + if (fragment == frag_map.end() || isFree(fragment->second)) return false; + + // Merge lower + if (fragment != frag_map.begin()) { + auto lower = fragment; + lower--; + if (isFree(lower->second)) { + free_list_.erase(lower->second.free_list_entry_); + lower->second.size += fragment->second.size; + frag_map.erase(fragment); + fragment = lower; + } + } + + // Merge upper + { + auto upper = fragment; + upper++; + if ((upper != frag_map.end()) && isFree(upper->second)) { + free_list_.erase(upper->second.free_list_entry_); + fragment->second.size += upper->second.size; + frag_map.erase(upper); + } + } + + // Move whole free blocks to block cache + if (frag_map.size() == 1) { + in_use_size_ -= fragment->second.size; + cache_size_ += fragment->second.size; + block_cache_.push_back(Block(fragment->first, fragment->second.size)); + block_list_.erase(frag_map_it); + + // Release old blocks when over cache limit. + while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) { + const auto& block = block_cache_.front(); + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + cache_size_ -= block.length_; + block_cache_.pop_front(); + } + + // Don't publish free space since block was moved to the cache. 
+ return true; + } + + // Report free fragment + const auto& freeEntry = + free_list_.insert(std::make_pair(fragment->second.size, fragment->first)); + setFree(fragment->second, freeEntry); + + return true; + } + + void trim() { + for (const auto& block : block_cache_) + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + block_cache_.clear(); + } + + size_t max_alloc() const { return block_allocator_.block_size(); } +}; + +#endif // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index d4a1841240..905aa31851 100755 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1488,7 +1488,7 @@ typedef struct hsa_amd_ipc_memory_s { * any process. In general applications should confirm that a shared memory * region has been attached (via hsa_amd_ipc_memory_attach) in the remote * process prior to releasing that memory in the local process. - * Repeated calls for the same allocaiton may, but are not required to, return + * Repeated calls for the same allocation may, but are not required to, return * unique handles. * * @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for