Add suballocator for ordinary VRAM allocations smaller than 2MB.

Track pointer info for sub 2MB fragment allocations in allocation_map_.

Add fragment support to IPC.

Change-Id: I00cfc2e2fa289aac90a4718c392f9bb056a61a87
Этот коммит содержится в:
Sean Keely
2017-09-11 20:31:20 -05:00
родитель ae4a9c4d91
Коммит 117be0b55a
8 изменённых файлов: 448 добавлений и 47 удалений
+15
Просмотреть файл
@@ -49,6 +49,7 @@
#include "core/inc/agent.h"
#include "core/inc/memory_region.h"
#include "core/util/simple_heap.h"
#include "inc/hsa_ext_amd.h"
@@ -181,7 +182,21 @@ class MemoryRegion : public core::MemoryRegion {
HSAuint64 virtual_size_;
static const size_t kPageSize_ = 4096;
// Block source handed to SimpleHeap (see fragment_allocator_): obtains and
// returns fixed-size blocks through the owning MemoryRegion.
class BlockAllocator {
 public:
  // Binds the allocator to the region it draws blocks from. Only a reference
  // is stored, so the region must outlive this object.
  explicit BlockAllocator(MemoryRegion& region) : region_(region) {}

  // Obtains one block able to hold request_size bytes; the size actually
  // obtained is written to allocated_size. Defined out of line.
  void* alloc(size_t request_size, size_t& allocated_size) const;

  // Hands a block back to the owning region. The region's status result is
  // not inspected.
  void free(void* ptr, size_t length) const { region_.Free(ptr, length); }

  // Size in bytes of every block this allocator produces.
  size_t block_size() const { return block_size_; }

 private:
  static const size_t block_size_ = 2 * 1024 * 1024;  // 2MB blocks.
  MemoryRegion& region_;
};
mutable SimpleHeap<BlockAllocator> fragment_allocator_;
};
} // namespace
#endif // header guard
+1
Просмотреть файл
@@ -85,6 +85,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
AllocateRestrict = (1 << 0), // Don't map system memory to GPU agents
AllocateExecutable = (1 << 1), // Set executable permission
AllocateDoubleMap = (1 << 2), // Map twice VA allocation to backing store
AllocateDirect = (1 << 3), // Bypass fragment cache.
};
typedef uint32_t AllocateFlags;
+10 -3
Просмотреть файл
@@ -264,8 +264,14 @@ class Runtime {
hsa_status_t InteropUnmap(void* ptr);
// Optional PtrInfo() output describing the whole allocation block that
// contains the queried pointer (as opposed to the fragment reported in
// hsa_amd_pointer_info_t).
struct PtrInfoBlockData {
  // Host base address of the containing block. Null until PtrInfo fills it in.
  void* base = nullptr;
  // Length of the containing block in bytes. Zero until PtrInfo fills it in.
  size_t length = 0;
};
hsa_status_t PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
uint32_t* num_agents_accessible, hsa_agent_t** accessible);
uint32_t* num_agents_accessible, hsa_agent_t** accessible,
PtrInfoBlockData* block_info = nullptr);
hsa_status_t SetPtrInfoData(void* ptr, void* userptr);
@@ -315,12 +321,13 @@ class Runtime {
static void AsyncEventsLoop(void*);
struct AllocationRegion {
AllocationRegion() : region(NULL), size(0) {}
AllocationRegion() : region(NULL), size(0), user_ptr(nullptr) {}
AllocationRegion(const MemoryRegion* region_arg, size_t size_arg)
: region(region_arg), size(size_arg) {}
: region(region_arg), size(size_arg), user_ptr(nullptr) {}
const MemoryRegion* region;
size_t size;
void* user_ptr;
};
struct AsyncEventsControl {
+30 -3
Просмотреть файл
@@ -49,6 +49,7 @@
#include "core/inc/amd_cpu_agent.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/util/utils.h"
#include "core/inc/exceptions.h"
namespace amd {
void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
@@ -98,13 +99,13 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
}
MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,
core::Agent* owner,
MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
const HsaMemoryProperties& mem_props)
: core::MemoryRegion(fine_grain, full_profile, owner),
mem_props_(mem_props),
max_single_alloc_size_(0),
virtual_size_(0) {
virtual_size_(0),
fragment_allocator_(BlockAllocator(*this)) {
virtual_size_ = GetPhysicalSize();
mem_flag_.Value = 0;
@@ -169,6 +170,15 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
kmt_alloc_flags.ui32.AQLQueueMemory =
(alloc_flags & AllocateDoubleMap ? 1 : 0);
// Only allow using the suballocator for ordinary VRAM.
bool useSubAlloc = IsLocalMemory();
useSubAlloc &= (alloc_flags == AllocateRestrict);
useSubAlloc &= (size <= fragment_allocator_.max_alloc());
if (useSubAlloc) {
*address = fragment_allocator_.alloc(size);
return HSA_STATUS_SUCCESS;
}
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
if (*address != NULL) {
@@ -220,6 +230,8 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
}
hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS;
MakeKfdMemoryUnresident(address);
FreeKfdMemory(address, size);
@@ -586,4 +598,19 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size,
return HSA_STATUS_SUCCESS;
}
void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
assert(request_size < block_size() && "BlockAllocator alloc request exceeds block size.");
void* ret;
hsa_status_t err = region_.Allocate(
block_size(), core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect,
&ret);
if (err != HSA_STATUS_SUCCESS)
throw new ::AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
assert(ret != nullptr && "Region returned nullptr on success.");
allocated_size = block_size();
return ret;
}
} // namespace
+136 -39
Просмотреть файл
@@ -321,11 +321,11 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
}
hsa_status_t Runtime::FreeMemory(void* ptr) {
if (ptr == NULL) {
if (ptr == nullptr) {
return HSA_STATUS_SUCCESS;
}
const MemoryRegion* region = NULL;
const MemoryRegion* region = nullptr;
size_t size = 0;
ScopedAcquire<KernelMutex> lock(&memory_lock_);
@@ -333,11 +333,14 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
if (it == allocation_map_.end()) {
assert(false && "Can't find address in allocation map");
return HSA_STATUS_ERROR;
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
region = it->second.region;
size = it->second.size;
// Imported fragments can't be released with FreeMemory.
if (region == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
allocation_map_.erase(it);
return region->Free(ptr, size);
@@ -681,7 +684,17 @@ hsa_status_t Runtime::InteropUnmap(void* ptr) {
}
hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
uint32_t* num_agents_accessible, hsa_agent_t** accessible) {
uint32_t* num_agents_accessible, hsa_agent_t** accessible,
PtrInfoBlockData* block_info) {
static_assert(static_cast<int>(HSA_POINTER_UNKNOWN) == static_cast<int>(HSA_EXT_POINTER_TYPE_UNKNOWN),
"Thunk pointer info mismatch");
static_assert(static_cast<int>(HSA_POINTER_ALLOCATED) == static_cast<int>(HSA_EXT_POINTER_TYPE_HSA),
"Thunk pointer info mismatch");
static_assert(static_cast<int>(HSA_POINTER_REGISTERED_USER) == static_cast<int>(HSA_EXT_POINTER_TYPE_LOCKED),
"Thunk pointer info mismatch");
static_assert(static_cast<int>(HSA_POINTER_REGISTERED_GRAPHICS) == static_cast<int>(HSA_EXT_POINTER_TYPE_GRAPHICS),
"Thunk pointer info mismatch");
HsaPointerInfo thunkInfo;
uint32_t* mappedNodes;
@@ -692,36 +705,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
bool returnListData =
((alloc != nullptr) && (num_agents_accessible != nullptr) && (accessible != nullptr));
if (returnListData) {
size_t max_agents = cpu_agents_.size() + gpu_agents_.size();
mappedNodes = (uint32_t*)alloca(max_agents * sizeof(uint32_t));
// memory_lock protects access to the NMappedNodes array since this changes with calls to memory
// APIs.
{ // memory_lock protects access to the NMappedNodes array and fragment user data since these may
// change with calls to memory APIs.
ScopedAcquire<KernelMutex> lock(&memory_lock_);
hsaKmtQueryPointerInfo(ptr, &thunkInfo);
assert(thunkInfo.NMappedNodes <= max_agents &&
"PointerInfo: Thunk returned more than all agents in NMappedNodes.");
memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
} else {
hsaKmtQueryPointerInfo(ptr, &thunkInfo);
}
static_assert((int)HSA_POINTER_UNKNOWN == (int)HSA_EXT_POINTER_TYPE_UNKNOWN,
"Thunk pointer info mismatch");
static_assert((int)HSA_POINTER_ALLOCATED == (int)HSA_EXT_POINTER_TYPE_HSA,
"Thunk pointer info mismatch");
static_assert((int)HSA_POINTER_REGISTERED_USER == (int)HSA_EXT_POINTER_TYPE_LOCKED,
"Thunk pointer info mismatch");
static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS,
"Thunk pointer info mismatch");
if (returnListData) {
assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
"PointerInfo: Thunk returned more than all agents in NMappedNodes.");
mappedNodes = (uint32_t*)alloca(thunkInfo.NMappedNodes * sizeof(uint32_t));
memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
}
retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
retInfo.hostBaseAddress = thunkInfo.CPUAddress;
retInfo.sizeInBytes = thunkInfo.SizeInBytes;
retInfo.userData = thunkInfo.UserData;
if (block_info != nullptr) {
block_info->base = retInfo.hostBaseAddress;
block_info->length = retInfo.sizeInBytes;
}
if (retInfo.type == HSA_EXT_POINTER_TYPE_HSA) {
auto fragment = allocation_map_.upper_bound(ptr);
if (fragment != allocation_map_.begin()) {
fragment--;
if ((fragment->first <= ptr) &&
(ptr < reinterpret_cast<const uint8_t*>(fragment->first) + fragment->second.size)) {
retInfo.hostBaseAddress = const_cast<void*>(fragment->first);
retInfo.agentBaseAddress = retInfo.hostBaseAddress;
retInfo.sizeInBytes = fragment->second.size;
retInfo.userData = fragment->second.user_ptr;
}
}
}
} // end lock scope
retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));
retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
retInfo.hostBaseAddress = thunkInfo.CPUAddress;
retInfo.sizeInBytes = thunkInfo.SizeInBytes;
retInfo.userData = thunkInfo.UserData;
retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
// Temp: workaround thunk bug, IPC memory has garbage in Node.
// retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
auto it = agents_by_node_.find(thunkInfo.Node);
if (it != agents_by_node_.end())
retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
else
retInfo.agentOwner.handle = 0;
memcpy(info, &retInfo, retInfo.size);
@@ -751,19 +778,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
}
hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) {
{ // Use allocation map if possible to handle fragments.
ScopedAcquire<KernelMutex> lock(&memory_lock_);
const auto& it = allocation_map_.find(ptr);
if (it != allocation_map_.end()) {
it->second.user_ptr = userptr;
return HSA_STATUS_SUCCESS;
}
}
// Cover entries not in the allocation map (graphics, lock,...)
if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_SUCCESS;
else
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) {
static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle),
"Thunk IPC mismatch.");
if (hsaKmtShareMemory(ptr, len, (HsaSharedMemoryHandle*)handle) == HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_SUCCESS;
else
// Reject sharing allocations larger than ~8TB due to thunk limitations.
if (len > 0x7FFFFFFF000ull) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
// Check for fragment sharing.
PtrInfoBlockData block;
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
if ((block.base != ptr) || (block.length != len)) {
if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) {
assert(false && "Fragment's block not aligned to 2MB!");
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
if (hsaKmtShareMemory(block.base, block.length, reinterpret_cast<HsaSharedMemoryHandle*>(
handle)) != HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
uint32_t offset =
(reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<uint8_t*>(block.base)) / 4096;
// Holds size in (4K?) pages in thunk handle: Mark as a fragment and denote offset.
handle->handle[6] |= 0x80000000 | offset;
} else {
if (hsaKmtShareMemory(ptr, len, reinterpret_cast<HsaSharedMemoryHandle*>(handle)) !=
HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents,
@@ -772,14 +830,36 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
void* importAddress;
HSAuint64 importSize;
HSAuint64 altAddress;
hsa_amd_ipc_memory_t importHandle;
importHandle = *handle;
// Extract fragment info
bool isFragment = false;
uint32_t fragOffset = 0;
auto fixFragment = [&]() {
if (!isFragment) return;
importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
len = Min(len, importSize - fragOffset);
ScopedAcquire<KernelMutex> lock(&memory_lock_);
allocation_map_[importAddress] = AllocationRegion(nullptr, len);
};
if ((importHandle.handle[6] & 0x80000000) != 0) {
isFragment = true;
fragOffset = (importHandle.handle[6] & 0x1FF) * 4096;
importHandle.handle[6] &= ~(0x80000000 | 0x1FF);
}
if (num_agents == 0) {
if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(handle),
if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle),
&importAddress, &importSize) != HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) != HSAKMT_STATUS_SUCCESS) {
hsaKmtDeregisterMemory(importAddress);
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
fixFragment();
*mapped_ptr = importAddress;
return HSA_STATUS_SUCCESS;
}
@@ -798,9 +878,9 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
for (int i = 0; i < num_agents; i++)
agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &nodes[i]);
if (hsaKmtRegisterSharedHandleToNodes(reinterpret_cast<const HsaSharedMemoryHandle*>(handle),
&importAddress, &importSize, num_agents,
nodes) != HSAKMT_STATUS_SUCCESS)
if (hsaKmtRegisterSharedHandleToNodes(
reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle), &importAddress,
&importSize, num_agents, nodes) != HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
HsaMemMapFlags map_flags;
@@ -816,11 +896,28 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
}
}
fixFragment();
*mapped_ptr = importAddress;
return HSA_STATUS_SUCCESS;
}
hsa_status_t Runtime::IPCDetach(void* ptr) {
{ // Handle imported fragments.
ScopedAcquire<KernelMutex> lock(&memory_lock_);
const auto& it = allocation_map_.find(ptr);
if (it != allocation_map_.end()) {
if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
allocation_map_.erase(it);
lock.Release(); // Can't hold memory lock when using pointer info.
PtrInfoBlockData block;
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
ptr = block.base;
}
}
if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
+8 -1
Просмотреть файл
@@ -59,7 +59,14 @@ class ScopedAcquire {
explicit ScopedAcquire(LockType* lock) : lock_(lock) { lock_->Acquire(); }
/// @brief: when destructing, release the lock.
~ScopedAcquire() { lock_->Release(); }
~ScopedAcquire() {
if (lock_ != nullptr) lock_->Release();
}
/// @brief: Release the lock before the end of the scope. The destructor
/// skips its own release once this has run. Calling it again is a no-op.
void Release() {
  // Already released: lock_ was cleared by an earlier call.
  if (lock_ == nullptr) return;
  lock_->Release();
  lock_ = nullptr;
}
private:
LockType* lock_;
+247
Просмотреть файл
@@ -0,0 +1,247 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
// A simple best fit memory allocator with eager compaction. Manages block sub-allocation.
// For use when memory efficiency is more important than allocation speed.
// O(log n) time.
#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <map>
#include <new>
#include <utility>

#include "core/util/utils.h"
// Best-fit sub-allocator that carves fragments out of blocks obtained from
// Allocator and merges adjacent free fragments as soon as they are freed.
//
// Allocator must provide:
//   void* alloc(size_t request_size, size_t& allocated_size);  // throws on failure
//   void free(void* ptr, size_t length);
//   size_t block_size() const;
//
// Blocks that become entirely free are parked in a cache for reuse and are
// returned to Allocator when the cache grows past twice the in-use size, on
// trim(), or on destruction.
template <typename Allocator> class SimpleHeap {
 private:
  // One contiguous piece of a block, either handed out or free.
  struct Fragment_T {
    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
    // Position of this fragment in free_list_, or free_list_.end() when the
    // fragment is in use.
    ptr_t free_list_entry_;
    // Length of the fragment in bytes.
    size_t size = 0;
    Fragment_T(ptr_t Iterator, size_t Len) : free_list_entry_(Iterator), size(Len) {}
    Fragment_T() = default;
  };

  // A whole block held in the reuse cache.
  struct Block {
    uintptr_t base_ptr_ = 0;
    size_t length_ = 0;
    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
    Block() = default;
  };

  Allocator block_allocator_;

  // Free fragments ordered by size (size -> base address); enables best fit.
  std::multimap<size_t, uintptr_t> free_list_;
  // Block base -> (fragment base -> fragment) for every block not in the cache.
  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;
  // Entirely free blocks kept for reuse; oldest at the front.
  std::deque<Block> block_cache_;

  // Total bytes of blocks currently tracked in block_list_.
  size_t in_use_size_;
  // Total bytes of blocks currently held in block_cache_.
  size_t cache_size_;

  // The helpers below are defined in-class and are therefore implicitly inline.
  bool isFree(const Fragment_T& node) { return node.free_list_entry_ != free_list_.end(); }

  void setUsed(Fragment_T& node) { node.free_list_entry_ = free_list_.end(); }

  void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
    node.free_list_entry_ = Iterator;
  }

  // Builds a fragment record marked as in use.
  Fragment_T makeFragment(size_t Len) { return Fragment_T(free_list_.end(), Len); }

  // Builds a fragment record marked as free with the given free list position.
  Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
    return Fragment_T(Iterator, Len);
  }

 public:
  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}

  // Releases cached blocks. Blocks that still contain live fragments are not
  // released here.
  ~SimpleHeap() {
    trim();
    // Leak here may be due to the user.  Check is for debugging only.
    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
  }

  SimpleHeap(const SimpleHeap& rhs) = delete;
  SimpleHeap(SimpleHeap&& rhs) = delete;
  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;

  // Allocates `bytes` bytes using best fit.
  //
  // @return Address of the fragment, or nullptr when bytes is zero.
  // @throws std::bad_alloc when bytes exceeds max_alloc(); whatever the block
  //         allocator throws when a new block cannot be obtained.
  void* alloc(size_t bytes) {
    if (bytes > max_alloc()) {
      assert(false && "Requested allocation is larger than block size.");
      throw std::bad_alloc();
    }

    // A zero length fragment would share its map key with the free remainder
    // that follows it and corrupt the bookkeeping, so hand out nothing.
    // free(nullptr) is accepted, which keeps alloc/free pairs balanced.
    if (bytes == 0) return nullptr;

    // Find best fit: the smallest free fragment that is large enough.
    auto free_fragment = free_list_.lower_bound(bytes);
    uintptr_t base;
    size_t size;
    if (free_fragment != free_list_.end()) {
      base = free_fragment->second;
      size = free_fragment->first;
      free_list_.erase(free_fragment);

      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");

      // Find the containing block and fragment.
      auto block_it = block_list_.upper_bound(base);
      --block_it;
      auto& frag_map = block_it->second;
      auto fragment = frag_map.find(base);

      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");

      // Sub-allocate from fragment.
      fragment->second.size = bytes;
      setUsed(fragment->second);

      // Record remaining free space.
      if (size > bytes) {
        free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
        frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
      }
      return reinterpret_cast<void*>(base);
    }

    // No usable fragment, check block cache.
    if (!block_cache_.empty()) {
      const Block block = block_cache_.back();
      block_cache_.pop_back();
      base = block.base_ptr_;
      size = block.length_;
      cache_size_ -= size;
    } else {  // Alloc new block.
      void* ptr = block_allocator_.alloc(bytes, size);
      base = reinterpret_cast<uintptr_t>(ptr);
      assert(ptr != nullptr && "Block allocation failed, Allocator is expected to throw.");
    }

    in_use_size_ += size;
    assert(size >= bytes && "Alloc exceeds block size.");

    // Sub alloc and insert free region.
    if (size > bytes) {
      free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
    }

    // Track used region.
    block_list_[base][base] = makeFragment(bytes);

    return reinterpret_cast<void*>(base);
  }

  // Frees a fragment previously returned by alloc().
  //
  // @return true when ptr is nullptr or was a live fragment of this heap;
  //         false when ptr is not the start of a live fragment (foreign
  //         pointer or double free), in which case nothing is changed.
  bool free(void* ptr) {
    if (ptr == nullptr) return true;

    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);

    // Find fragment and validate.
    auto frag_map_it = block_list_.upper_bound(base);
    if (frag_map_it == block_list_.begin()) return false;
    --frag_map_it;
    auto& frag_map = frag_map_it->second;
    auto fragment = frag_map.find(base);
    if (fragment == frag_map.end() || isFree(fragment->second)) return false;

    // Merge lower.
    if (fragment != frag_map.begin()) {
      auto lower = fragment;
      --lower;
      if (isFree(lower->second)) {
        free_list_.erase(lower->second.free_list_entry_);
        lower->second.size += fragment->second.size;
        frag_map.erase(fragment);
        fragment = lower;
      }
    }

    // Merge upper.
    {
      auto upper = fragment;
      ++upper;
      if ((upper != frag_map.end()) && isFree(upper->second)) {
        free_list_.erase(upper->second.free_list_entry_);
        fragment->second.size += upper->second.size;
        frag_map.erase(upper);
      }
    }

    // Move whole free blocks to block cache.
    if (frag_map.size() == 1) {
      in_use_size_ -= fragment->second.size;
      cache_size_ += fragment->second.size;
      block_cache_.push_back(Block(fragment->first, fragment->second.size));
      block_list_.erase(frag_map_it);

      // Release old blocks when over cache limit.
      while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) {
        const auto& block = block_cache_.front();
        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
        cache_size_ -= block.length_;
        block_cache_.pop_front();
      }

      // Don't publish free space since block was moved to the cache.
      return true;
    }

    // Report free fragment.
    auto free_entry = free_list_.insert(std::make_pair(fragment->second.size, fragment->first));
    setFree(fragment->second, free_entry);

    return true;
  }

  // Returns every cached block to the block allocator.
  void trim() {
    for (const auto& block : block_cache_)
      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
    block_cache_.clear();
    // Keep the accounting in step with the now empty cache; a stale total
    // would make free() evict blocks cached afterwards too eagerly.
    cache_size_ = 0;
  }

  // Largest request alloc() accepts: the block allocator's block size.
  size_t max_alloc() const { return block_allocator_.block_size(); }
};
#endif // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+1 -1
Просмотреть файл
@@ -1488,7 +1488,7 @@ typedef struct hsa_amd_ipc_memory_s {
* any process. In general applications should confirm that a shared memory
* region has been attached (via hsa_amd_ipc_memory_attach) in the remote
* process prior to releasing that memory in the local process.
* Repeated calls for the same allocaiton may, but are not required to, return
* Repeated calls for the same allocation may, but are not required to, return
* unique handles.
*
* @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for