Add suballocator for ordinary VRAM allocations smaller than 2MB.
Track pointer info for sub 2MB fragment allocations in allocation_map_. Add fragment support to IPC. Change-Id: I00cfc2e2fa289aac90a4718c392f9bb056a61a87
Этот коммит содержится в:
@@ -49,6 +49,7 @@
|
||||
|
||||
#include "core/inc/agent.h"
|
||||
#include "core/inc/memory_region.h"
|
||||
#include "core/util/simple_heap.h"
|
||||
|
||||
#include "inc/hsa_ext_amd.h"
|
||||
|
||||
@@ -181,7 +182,21 @@ class MemoryRegion : public core::MemoryRegion {
|
||||
HSAuint64 virtual_size_;
|
||||
|
||||
static const size_t kPageSize_ = 4096;
|
||||
|
||||
class BlockAllocator {
|
||||
private:
|
||||
MemoryRegion& region_;
|
||||
static const size_t block_size_ = 2 * 1024 * 1024; // 2MB blocks.
|
||||
public:
|
||||
explicit BlockAllocator(MemoryRegion& region) : region_(region) {}
|
||||
void* alloc(size_t request_size, size_t& allocated_size) const;
|
||||
void free(void* ptr, size_t length) const { region_.Free(ptr, length); }
|
||||
size_t block_size() const { return block_size_; }
|
||||
};
|
||||
|
||||
mutable SimpleHeap<BlockAllocator> fragment_allocator_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // header guard
|
||||
|
||||
@@ -85,6 +85,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
|
||||
AllocateRestrict = (1 << 0), // Don't map system memory to GPU agents
|
||||
AllocateExecutable = (1 << 1), // Set executable permission
|
||||
AllocateDoubleMap = (1 << 2), // Map twice VA allocation to backing store
|
||||
AllocateDirect = (1 << 3), // Bypass fragment cache.
|
||||
};
|
||||
|
||||
typedef uint32_t AllocateFlags;
|
||||
|
||||
@@ -264,8 +264,14 @@ class Runtime {
|
||||
|
||||
hsa_status_t InteropUnmap(void* ptr);
|
||||
|
||||
// Optional extra output of PtrInfo(): the base address and length that PtrInfo
// copies from the pointer info's hostBaseAddress and sizeInBytes before any
// fragment adjustment is applied. Members are default-initialized so that a
// default-constructed instance never exposes indeterminate values.
struct PtrInfoBlockData {
  void* base = nullptr;  // Base address of the containing block.
  size_t length = 0;     // Length of the containing block in bytes.
};
|
||||
|
||||
hsa_status_t PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
|
||||
uint32_t* num_agents_accessible, hsa_agent_t** accessible);
|
||||
uint32_t* num_agents_accessible, hsa_agent_t** accessible,
|
||||
PtrInfoBlockData* block_info = nullptr);
|
||||
|
||||
hsa_status_t SetPtrInfoData(void* ptr, void* userptr);
|
||||
|
||||
@@ -315,12 +321,13 @@ class Runtime {
|
||||
static void AsyncEventsLoop(void*);
|
||||
|
||||
struct AllocationRegion {
|
||||
AllocationRegion() : region(NULL), size(0) {}
|
||||
AllocationRegion() : region(NULL), size(0), user_ptr(nullptr) {}
|
||||
AllocationRegion(const MemoryRegion* region_arg, size_t size_arg)
|
||||
: region(region_arg), size(size_arg) {}
|
||||
: region(region_arg), size(size_arg), user_ptr(nullptr) {}
|
||||
|
||||
const MemoryRegion* region;
|
||||
size_t size;
|
||||
void* user_ptr;
|
||||
};
|
||||
|
||||
struct AsyncEventsControl {
|
||||
|
||||
@@ -49,6 +49,7 @@
|
||||
#include "core/inc/amd_cpu_agent.h"
|
||||
#include "core/inc/amd_gpu_agent.h"
|
||||
#include "core/util/utils.h"
|
||||
#include "core/inc/exceptions.h"
|
||||
|
||||
namespace amd {
|
||||
void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
|
||||
@@ -98,13 +99,13 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
|
||||
hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
|
||||
}
|
||||
|
||||
MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,
|
||||
core::Agent* owner,
|
||||
MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
|
||||
const HsaMemoryProperties& mem_props)
|
||||
: core::MemoryRegion(fine_grain, full_profile, owner),
|
||||
mem_props_(mem_props),
|
||||
max_single_alloc_size_(0),
|
||||
virtual_size_(0) {
|
||||
virtual_size_(0),
|
||||
fragment_allocator_(BlockAllocator(*this)) {
|
||||
virtual_size_ = GetPhysicalSize();
|
||||
|
||||
mem_flag_.Value = 0;
|
||||
@@ -169,6 +170,15 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
|
||||
kmt_alloc_flags.ui32.AQLQueueMemory =
|
||||
(alloc_flags & AllocateDoubleMap ? 1 : 0);
|
||||
|
||||
// Only allow using the suballocator for ordinary VRAM.
|
||||
bool useSubAlloc = IsLocalMemory();
|
||||
useSubAlloc &= (alloc_flags == AllocateRestrict);
|
||||
useSubAlloc &= (size <= fragment_allocator_.max_alloc());
|
||||
if (useSubAlloc) {
|
||||
*address = fragment_allocator_.alloc(size);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
|
||||
|
||||
if (*address != NULL) {
|
||||
@@ -220,6 +230,8 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
|
||||
}
|
||||
|
||||
hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
|
||||
if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS;
|
||||
|
||||
MakeKfdMemoryUnresident(address);
|
||||
|
||||
FreeKfdMemory(address, size);
|
||||
@@ -586,4 +598,19 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size,
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
|
||||
assert(request_size < block_size() && "BlockAllocator alloc request exceeds block size.");
|
||||
|
||||
void* ret;
|
||||
hsa_status_t err = region_.Allocate(
|
||||
block_size(), core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect,
|
||||
&ret);
|
||||
if (err != HSA_STATUS_SUCCESS)
|
||||
throw new ::AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
|
||||
assert(ret != nullptr && "Region returned nullptr on success.");
|
||||
|
||||
allocated_size = block_size();
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -321,11 +321,11 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::FreeMemory(void* ptr) {
|
||||
if (ptr == NULL) {
|
||||
if (ptr == nullptr) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
const MemoryRegion* region = NULL;
|
||||
const MemoryRegion* region = nullptr;
|
||||
size_t size = 0;
|
||||
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
||||
|
||||
@@ -333,11 +333,14 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
|
||||
|
||||
if (it == allocation_map_.end()) {
|
||||
assert(false && "Can't find address in allocation map");
|
||||
return HSA_STATUS_ERROR;
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
region = it->second.region;
|
||||
size = it->second.size;
|
||||
|
||||
// Imported fragments can't be released with FreeMemory.
|
||||
if (region == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
allocation_map_.erase(it);
|
||||
|
||||
return region->Free(ptr, size);
|
||||
@@ -681,7 +684,17 @@ hsa_status_t Runtime::InteropUnmap(void* ptr) {
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
|
||||
uint32_t* num_agents_accessible, hsa_agent_t** accessible) {
|
||||
uint32_t* num_agents_accessible, hsa_agent_t** accessible,
|
||||
PtrInfoBlockData* block_info) {
|
||||
static_assert(static_cast<int>(HSA_POINTER_UNKNOWN) == static_cast<int>(HSA_EXT_POINTER_TYPE_UNKNOWN),
|
||||
"Thunk pointer info mismatch");
|
||||
static_assert(static_cast<int>(HSA_POINTER_ALLOCATED) == static_cast<int>(HSA_EXT_POINTER_TYPE_HSA),
|
||||
"Thunk pointer info mismatch");
|
||||
static_assert(static_cast<int>(HSA_POINTER_REGISTERED_USER) == static_cast<int>(HSA_EXT_POINTER_TYPE_LOCKED),
|
||||
"Thunk pointer info mismatch");
|
||||
static_assert(static_cast<int>(HSA_POINTER_REGISTERED_GRAPHICS) == static_cast<int>(HSA_EXT_POINTER_TYPE_GRAPHICS),
|
||||
"Thunk pointer info mismatch");
|
||||
|
||||
HsaPointerInfo thunkInfo;
|
||||
uint32_t* mappedNodes;
|
||||
|
||||
@@ -692,36 +705,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
|
||||
|
||||
bool returnListData =
|
||||
((alloc != nullptr) && (num_agents_accessible != nullptr) && (accessible != nullptr));
|
||||
if (returnListData) {
|
||||
size_t max_agents = cpu_agents_.size() + gpu_agents_.size();
|
||||
mappedNodes = (uint32_t*)alloca(max_agents * sizeof(uint32_t));
|
||||
// memory_lock protects access to the NMappedNodes array since this changes with calls to memory
|
||||
// APIs.
|
||||
|
||||
{ // memory_lock protects access to the NMappedNodes array and fragment user data since these may
|
||||
// change with calls to memory APIs.
|
||||
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
||||
hsaKmtQueryPointerInfo(ptr, &thunkInfo);
|
||||
assert(thunkInfo.NMappedNodes <= max_agents &&
|
||||
"PointerInfo: Thunk returned more than all agents in NMappedNodes.");
|
||||
memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
|
||||
} else {
|
||||
hsaKmtQueryPointerInfo(ptr, &thunkInfo);
|
||||
}
|
||||
|
||||
static_assert((int)HSA_POINTER_UNKNOWN == (int)HSA_EXT_POINTER_TYPE_UNKNOWN,
|
||||
"Thunk pointer info mismatch");
|
||||
static_assert((int)HSA_POINTER_ALLOCATED == (int)HSA_EXT_POINTER_TYPE_HSA,
|
||||
"Thunk pointer info mismatch");
|
||||
static_assert((int)HSA_POINTER_REGISTERED_USER == (int)HSA_EXT_POINTER_TYPE_LOCKED,
|
||||
"Thunk pointer info mismatch");
|
||||
static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS,
|
||||
"Thunk pointer info mismatch");
|
||||
if (returnListData) {
|
||||
assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
|
||||
"PointerInfo: Thunk returned more than all agents in NMappedNodes.");
|
||||
mappedNodes = (uint32_t*)alloca(thunkInfo.NMappedNodes * sizeof(uint32_t));
|
||||
memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
|
||||
}
|
||||
retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
|
||||
retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
|
||||
retInfo.hostBaseAddress = thunkInfo.CPUAddress;
|
||||
retInfo.sizeInBytes = thunkInfo.SizeInBytes;
|
||||
retInfo.userData = thunkInfo.UserData;
|
||||
if (block_info != nullptr) {
|
||||
block_info->base = retInfo.hostBaseAddress;
|
||||
block_info->length = retInfo.sizeInBytes;
|
||||
}
|
||||
if (retInfo.type == HSA_EXT_POINTER_TYPE_HSA) {
|
||||
auto fragment = allocation_map_.upper_bound(ptr);
|
||||
if (fragment != allocation_map_.begin()) {
|
||||
fragment--;
|
||||
if ((fragment->first <= ptr) &&
|
||||
(ptr < reinterpret_cast<const uint8_t*>(fragment->first) + fragment->second.size)) {
|
||||
retInfo.hostBaseAddress = const_cast<void*>(fragment->first);
|
||||
retInfo.agentBaseAddress = retInfo.hostBaseAddress;
|
||||
retInfo.sizeInBytes = fragment->second.size;
|
||||
retInfo.userData = fragment->second.user_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // end lock scope
|
||||
|
||||
retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));
|
||||
retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
|
||||
retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
|
||||
retInfo.hostBaseAddress = thunkInfo.CPUAddress;
|
||||
retInfo.sizeInBytes = thunkInfo.SizeInBytes;
|
||||
retInfo.userData = thunkInfo.UserData;
|
||||
retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
|
||||
|
||||
// Temp: workaround thunk bug, IPC memory has garbage in Node.
|
||||
// retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
|
||||
auto it = agents_by_node_.find(thunkInfo.Node);
|
||||
if (it != agents_by_node_.end())
|
||||
retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
|
||||
else
|
||||
retInfo.agentOwner.handle = 0;
|
||||
|
||||
memcpy(info, &retInfo, retInfo.size);
|
||||
|
||||
@@ -751,19 +778,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) {
|
||||
{ // Use allocation map if possible to handle fragments.
|
||||
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
||||
const auto& it = allocation_map_.find(ptr);
|
||||
if (it != allocation_map_.end()) {
|
||||
it->second.user_ptr = userptr;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
// Cover entries not in the allocation map (graphics, lock,...)
|
||||
if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_SUCCESS;
|
||||
else
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) {
|
||||
static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle),
|
||||
"Thunk IPC mismatch.");
|
||||
if (hsaKmtShareMemory(ptr, len, (HsaSharedMemoryHandle*)handle) == HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_SUCCESS;
|
||||
else
|
||||
// Reject sharing allocations larger than ~8TB due to thunk limitations.
|
||||
if (len > 0x7FFFFFFF000ull) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
// Check for fragment sharing.
|
||||
PtrInfoBlockData block;
|
||||
hsa_amd_pointer_info_t info;
|
||||
info.size = sizeof(info);
|
||||
if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
if ((block.base != ptr) || (block.length != len)) {
|
||||
if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) {
|
||||
assert(false && "Fragment's block not aligned to 2MB!");
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
if (hsaKmtShareMemory(block.base, block.length, reinterpret_cast<HsaSharedMemoryHandle*>(
|
||||
handle)) != HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
uint32_t offset =
|
||||
(reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<uint8_t*>(block.base)) / 4096;
|
||||
// Holds size in (4K?) pages in thunk handle: Mark as a fragment and denote offset.
|
||||
handle->handle[6] |= 0x80000000 | offset;
|
||||
} else {
|
||||
if (hsaKmtShareMemory(ptr, len, reinterpret_cast<HsaSharedMemoryHandle*>(handle)) !=
|
||||
HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents,
|
||||
@@ -772,14 +830,36 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
|
||||
void* importAddress;
|
||||
HSAuint64 importSize;
|
||||
HSAuint64 altAddress;
|
||||
|
||||
hsa_amd_ipc_memory_t importHandle;
|
||||
importHandle = *handle;
|
||||
|
||||
// Extract fragment info
|
||||
bool isFragment = false;
|
||||
uint32_t fragOffset = 0;
|
||||
auto fixFragment = [&]() {
|
||||
if (!isFragment) return;
|
||||
importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
|
||||
len = Min(len, importSize - fragOffset);
|
||||
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
||||
allocation_map_[importAddress] = AllocationRegion(nullptr, len);
|
||||
};
|
||||
|
||||
if ((importHandle.handle[6] & 0x80000000) != 0) {
|
||||
isFragment = true;
|
||||
fragOffset = (importHandle.handle[6] & 0x1FF) * 4096;
|
||||
importHandle.handle[6] &= ~(0x80000000 | 0x1FF);
|
||||
}
|
||||
|
||||
if (num_agents == 0) {
|
||||
if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(handle),
|
||||
if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle),
|
||||
&importAddress, &importSize) != HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) != HSAKMT_STATUS_SUCCESS) {
|
||||
hsaKmtDeregisterMemory(importAddress);
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
fixFragment();
|
||||
*mapped_ptr = importAddress;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -798,9 +878,9 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
|
||||
for (int i = 0; i < num_agents; i++)
|
||||
agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &nodes[i]);
|
||||
|
||||
if (hsaKmtRegisterSharedHandleToNodes(reinterpret_cast<const HsaSharedMemoryHandle*>(handle),
|
||||
&importAddress, &importSize, num_agents,
|
||||
nodes) != HSAKMT_STATUS_SUCCESS)
|
||||
if (hsaKmtRegisterSharedHandleToNodes(
|
||||
reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle), &importAddress,
|
||||
&importSize, num_agents, nodes) != HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
HsaMemMapFlags map_flags;
|
||||
@@ -816,11 +896,28 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
|
||||
}
|
||||
}
|
||||
|
||||
fixFragment();
|
||||
*mapped_ptr = importAddress;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::IPCDetach(void* ptr) {
|
||||
{ // Handle imported fragments.
|
||||
ScopedAcquire<KernelMutex> lock(&memory_lock_);
|
||||
const auto& it = allocation_map_.find(ptr);
|
||||
if (it != allocation_map_.end()) {
|
||||
if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
allocation_map_.erase(it);
|
||||
lock.Release(); // Can't hold memory lock when using pointer info.
|
||||
|
||||
PtrInfoBlockData block;
|
||||
hsa_amd_pointer_info_t info;
|
||||
info.size = sizeof(info);
|
||||
if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
ptr = block.base;
|
||||
}
|
||||
}
|
||||
if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
|
||||
|
||||
@@ -59,7 +59,14 @@ class ScopedAcquire {
|
||||
explicit ScopedAcquire(LockType* lock) : lock_(lock) { lock_->Acquire(); }
|
||||
|
||||
/// @brief: when destructing, release the lock.
|
||||
~ScopedAcquire() { lock_->Release(); }
|
||||
~ScopedAcquire() {
|
||||
if (lock_ != nullptr) lock_->Release();
|
||||
}
|
||||
|
||||
void Release() {
|
||||
lock_->Release();
|
||||
lock_ = nullptr;
|
||||
}
|
||||
|
||||
private:
|
||||
LockType* lock_;
|
||||
|
||||
@@ -0,0 +1,247 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// A simple best fit memory allocator with eager compaction. Manages block sub-allocation.
|
||||
// For use when memory efficiency is more important than allocation speed.
|
||||
// O(log n) time.
|
||||
|
||||
#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
|
||||
#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
|
||||
|
||||
#include <map>
|
||||
#include <deque>
|
||||
#include <utility>
|
||||
|
||||
#include "core/util/utils.h"
|
||||
|
||||
// Best-fit sub-allocator that carves fragments out of blocks obtained from
// Allocator. Adjacent free fragments are merged as soon as they are freed, and
// blocks that become entirely free are parked in a small cache before being
// returned to the Allocator.
//
// Allocator must provide:
//   void*  alloc(size_t request_size, size_t& allocated_size);  // expected to throw on failure
//   void   free(void* ptr, size_t length);
//   size_t block_size() const;                                  // largest fragment supported
//
// The class performs no locking of its own.
template <typename Allocator> class SimpleHeap {
 private:
  // Bookkeeping for one fragment (used or free) inside a block.
  struct Fragment_T {
    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
    // Entry in free_list_ while the fragment is free, free_list_.end() while in use.
    ptr_t free_list_entry_;
    size_t size;

    Fragment_T(ptr_t Iterator, size_t Len) : free_list_entry_(Iterator), size(Len) {}
    Fragment_T() = default;
  };

  // A whole, completely free block held in block_cache_.
  struct Block {
    uintptr_t base_ptr_;
    size_t length_;

    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
    Block() = default;
  };

  Allocator block_allocator_;

  // Free fragments keyed by size (value is the fragment address); used for best-fit lookup.
  std::multimap<size_t, uintptr_t> free_list_;
  // Blocks holding at least one used fragment: block base -> (fragment address -> fragment).
  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;
  // Entirely free blocks kept for reuse; the most recently freed block is at the back.
  std::deque<Block> block_cache_;

  size_t in_use_size_;  // Total bytes of the blocks tracked in block_list_.
  size_t cache_size_;   // Total bytes of the blocks held in block_cache_.

  __forceinline bool isFree(const Fragment_T& node) {
    return node.free_list_entry_ != free_list_.end();
  }
  __forceinline void setUsed(Fragment_T& node) { node.free_list_entry_ = free_list_.end(); }
  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
    node.free_list_entry_ = Iterator;
  }
  __forceinline Fragment_T makeFragment(size_t Len) { return Fragment_T(free_list_.end(), Len); }
  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
    return Fragment_T(Iterator, Len);
  }

 public:
  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}
  ~SimpleHeap() {
    trim();
    // Leak here may be due to the user. Check is for debugging only.
    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
  }

  SimpleHeap(const SimpleHeap& rhs) = delete;
  SimpleHeap(SimpleHeap&& rhs) = delete;
  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;

  // Allocates a fragment of exactly `bytes` bytes.
  //
  // @param bytes Requested size; must not exceed max_alloc().
  // @return Address of the fragment, or nullptr when bytes == 0.
  // @throws std::bad_alloc if bytes > max_alloc(); whatever Allocator::alloc
  //         throws if a new block is needed and cannot be obtained.
  void* alloc(size_t bytes) {
    if (bytes > max_alloc()) {
      assert(false && "Requested allocation is larger than block size.");
      throw std::bad_alloc();
    }

    // A zero-length fragment would share its address with the free remainder
    // that follows it, so the same pointer could be handed out twice. free()
    // already accepts nullptr, which keeps the pair symmetric.
    if (bytes == 0) return nullptr;

    // Find best fit.
    auto free_fragment = free_list_.lower_bound(bytes);
    uintptr_t base;
    size_t size;

    if (free_fragment != free_list_.end()) {
      base = free_fragment->second;
      size = free_fragment->first;
      free_list_.erase(free_fragment);

      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");

      // Find the containing block and fragment
      auto it = block_list_.upper_bound(base);
      assert(it != block_list_.begin() && "Inconsistency in SimpleHeap.");
      --it;
      auto& frag_map = it->second;
      const auto fragment = frag_map.find(base);

      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");

      // Sub-allocate from fragment.
      fragment->second.size = bytes;
      setUsed(fragment->second);
      // Record remaining free space.
      if (size > bytes) {
        free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
        frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
      }
      return reinterpret_cast<void*>(base);
    }

    // No usable fragment, check block cache
    if (!block_cache_.empty()) {
      const auto& block = block_cache_.back();
      base = block.base_ptr_;
      size = block.length_;
      block_cache_.pop_back();
      cache_size_ -= size;
    } else {  // Alloc new block
      void* ptr = block_allocator_.alloc(bytes, size);
      base = reinterpret_cast<uintptr_t>(ptr);
      assert(ptr != nullptr && "Block allocation failed, Allocator is expected to throw.");
    }

    in_use_size_ += size;
    assert(size >= bytes && "Alloc exceeds block size.");
    // Sub alloc and insert free region.
    if (size > bytes) {
      free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
    }
    // Track used region
    block_list_[base][base] = makeFragment(bytes);

    return reinterpret_cast<void*>(base);
  }

  // Releases a fragment previously returned by alloc().
  //
  // @param ptr Fragment address; nullptr is accepted and ignored.
  // @return true if ptr was nullptr or a live fragment of this heap (now freed);
  //         false if ptr is not the start of a used fragment, in which case
  //         the heap is left untouched.
  bool free(void* ptr) {
    if (ptr == nullptr) return true;

    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);

    // Find fragment and validate.
    auto frag_map_it = block_list_.upper_bound(base);
    if (frag_map_it == block_list_.begin()) return false;
    --frag_map_it;
    auto& frag_map = frag_map_it->second;
    auto fragment = frag_map.find(base);
    if (fragment == frag_map.end() || isFree(fragment->second)) return false;

    // Merge lower
    if (fragment != frag_map.begin()) {
      auto lower = fragment;
      --lower;
      if (isFree(lower->second)) {
        free_list_.erase(lower->second.free_list_entry_);
        lower->second.size += fragment->second.size;
        frag_map.erase(fragment);
        fragment = lower;
      }
    }

    // Merge upper
    {
      auto upper = fragment;
      ++upper;
      if ((upper != frag_map.end()) && isFree(upper->second)) {
        free_list_.erase(upper->second.free_list_entry_);
        fragment->second.size += upper->second.size;
        frag_map.erase(upper);
      }
    }

    // Move whole free blocks to block cache
    if (frag_map.size() == 1) {
      in_use_size_ -= fragment->second.size;
      cache_size_ += fragment->second.size;
      block_cache_.push_back(Block(fragment->first, fragment->second.size));
      block_list_.erase(frag_map_it);

      // Release old blocks when over cache limit.
      while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) {
        const auto& block = block_cache_.front();
        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
        cache_size_ -= block.length_;
        block_cache_.pop_front();
      }

      // Don't publish free space since block was moved to the cache.
      return true;
    }

    // Report free fragment
    const auto freeEntry =
        free_list_.insert(std::make_pair(fragment->second.size, fragment->first));
    setFree(fragment->second, freeEntry);

    return true;
  }

  // Returns every cached (entirely free) block to the Allocator.
  void trim() {
    for (const auto& block : block_cache_)
      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
    block_cache_.clear();
    // Keep the accounting in step with the now-empty cache; a stale value would
    // make free() evict cached blocks earlier than the cache limit intends.
    cache_size_ = 0;
  }

  // Largest request alloc() accepts, i.e. the Allocator's block size.
  size_t max_alloc() const { return block_allocator_.block_size(); }
};
|
||||
|
||||
#endif // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
|
||||
@@ -1488,7 +1488,7 @@ typedef struct hsa_amd_ipc_memory_s {
|
||||
* any process. In general applications should confirm that a shared memory
|
||||
* region has been attached (via hsa_amd_ipc_memory_attach) in the remote
|
||||
* process prior to releasing that memory in the local process.
|
||||
* Repeated calls for the same allocaiton may, but are not required to, return
|
||||
* Repeated calls for the same allocation may, but are not required to, return
|
||||
* unique handles.
|
||||
*
|
||||
* @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for
|
||||
|
||||
Ссылка в новой задаче
Block a user