From 0100fa968cacb80ef5c7d99b0a3f44cde3f52dbe Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Thu, 10 Jun 2021 19:26:08 -0500 Subject: [PATCH] Allocate any size vram request through the fragment allocator. Enables the fragment allocator to handle >2MB allocations, maintaining good TLB alignment. Prior code contained a bug that caused the effective API granule for vram allocations >2MB to be bumped to 2MB. Also adjusts the block cache's block retention heuristic to not count discarded blocks as in use. This will reduce block retention when a significant amount of large blocks or IPC is in use. Change-Id: I30bd85eb87951df822211f799d9cfe579ab109c6 [ROCm/ROCR-Runtime commit: 8adbda1c1856c998b3aec365f00bc9c1ca287f4a] --- .../core/runtime/amd_memory_region.cpp | 12 ++--- .../hsa-runtime/core/util/simple_heap.h | 54 ++++++++++++------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index 4419286fd9..d96e5db294 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -188,15 +188,10 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi // Avoid modifying executable or queue allocations. bool useSubAlloc = subAllocEnabled; useSubAlloc &= ((alloc_flags & (~AllocateRestrict)) == 0); - useSubAlloc &= (size <= fragment_allocator_.max_alloc()); if (useSubAlloc) { *address = fragment_allocator_.alloc(size); return HSA_STATUS_SUCCESS; } - if (subAllocEnabled) { - // Pad up larger VRAM allocations. - size = AlignUp(size, fragment_allocator_.max_alloc()); - } } // Allocate memory. @@ -702,17 +697,16 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size, void MemoryRegion::Trim() const { fragment_allocator_.trim(); } void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const { - assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size."); - void* ret; - size_t bsize = block_size(); + size_t bsize = AlignUp(request_size, block_size()); + hsa_status_t err = region_.Allocate( bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret); if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed."); assert(ret != nullptr && "Region returned nullptr on success."); - allocated_size = block_size(); + allocated_size = bsize; return ret; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/simple_heap.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/simple_heap.h index a5a5c1a859..51eb38eb84 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/simple_heap.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/simple_heap.h @@ -85,7 +85,9 @@ template class SimpleHeap { std::map> block_list_; std::deque block_cache_; + // Size of blocks that are at least partially in use. size_t in_use_size_; + // Total size of block cache size_t cache_size_; __forceinline bool isFree(const Fragment_T& node) { return node.free; } @@ -129,12 +131,6 @@ template class SimpleHeap { SimpleHeap& operator=(SimpleHeap&& rhs) = delete; void* alloc(size_t bytes) { - if (bytes > max_alloc()) { - assert(false && "Requested allocation is larger than block size."); - throw std::bad_alloc(); - return nullptr; - } - // Find best fit. auto free_fragment = free_list_.lower_bound(bytes); uintptr_t base; @@ -168,13 +164,13 @@ template class SimpleHeap { } // No usable fragment, check block cache - if (!block_cache_.empty()) { + if (bytes < default_block_size() && !block_cache_.empty()) { const auto& block = block_cache_.back(); base = block.base_ptr_; size = block.length_; block_cache_.pop_back(); cache_size_ -= size; - } else { // Alloc new block + } else { // Alloc new block - new block may be larger than default. void* ptr = block_allocator_.alloc(bytes, size); base = reinterpret_cast(ptr); assert(ptr != nullptr && "Block allocation failed, Allocator is expected to throw."); @@ -190,6 +186,13 @@ template class SimpleHeap { // Track used region block_list_[base][base] = makeFragment(bytes); + // Disallow multiple suballocation from large blocks. + // Prevents a small allocation from retaining a large block. + if (bytes > default_block_size()) { + bool err = discardBlock(reinterpret_cast(base)); + assert(err && "Large block discard failed."); + } + return reinterpret_cast(base); } @@ -234,7 +237,6 @@ template class SimpleHeap { // Release whole free blocks. if (frag_map.size() == 1) { Block block(fragment->first, fragment->second.size); - in_use_size_ -= block.length_; block_list_.erase(frag_map_it); // Discard or add to the block cache. @@ -243,15 +245,10 @@ template class SimpleHeap { } else { block_cache_.push_back(block); cache_size_ += block.length_; + in_use_size_ -= block.length_; } - // Release old blocks when over cache limit. - while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) { - const auto& block = block_cache_.front(); - block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); - cache_size_ -= block.length_; - block_cache_.pop_front(); - } + balance(); // Don't publish free space since block was moved to the cache. return true; @@ -268,6 +265,16 @@ template class SimpleHeap { return true; } + void balance() { + // Release old blocks when over cache limit. + while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) { + const auto& block = block_cache_.front(); + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + cache_size_ -= block.length_; + block_cache_.pop_front(); + } + } + void trim() { for (const auto& block : block_cache_) block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); @@ -275,7 +282,7 @@ template class SimpleHeap { cache_size_ = 0; } - size_t max_alloc() const { return block_allocator_.block_size(); } + size_t default_block_size() const { return block_allocator_.block_size(); } // Prevent reuse of the block containing ptr. No further fragments will be allocated from the // block and the block will not be added to the block cache when it is free. @@ -293,8 +300,17 @@ template class SimpleHeap { (frag_map.rbegin()->first + frag_map.rbegin()->second.size <= base)) return false; - // Mark all fragments for discard. Removes freelist records for all fragments in the block. - for (auto& frag : frag_map) discard(frag.second); + // Mark all fragments for discard and compute block size. Removes freelist records for all + // fragments in the block. + size_t size = 0; + for (auto& frag : frag_map) { + discard(frag.second); + size += frag.second.size; + } + + // Remove discarded block from in-use tracking and rebalance the block cache. + in_use_size_ -= size; + balance(); return true; }