Files
rocm-systems/hipamd/src/hip_mempool_impl.cpp
T
Ioannis Assiouras 44b6b6813d SWDEV-503760 - Only consider allocations that are less than X% larger in a mempool request
Change-Id: I94acbca606fd4c575e2e1a9e34959ce650571867
2025-01-13 16:57:26 -05:00

528 lines
19 KiB
C++

/* Copyright (c) 2022-2023 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "hip_mempool_impl.hpp"
#include "hip_vm.hpp"
#include "platform/command.hpp"
namespace hip {
// ================================================================================================
void Heap::AddMemory(amd::Memory* memory, Stream* stream) {
  // Track the allocation with a timestamp seeded from the given stream. The map insertion
  // and the size/high-watermark accounting are shared with the timestamp overload.
  AddMemory(memory, MemoryTimestamp(stream));
}
// ================================================================================================
void Heap::AddMemory(amd::Memory* memory, const MemoryTimestamp& ts) {
  // Allocations are keyed by {size, memory object} and carry the supplied timestamp
  const auto bytes = memory->getSize();
  allocations_.insert(SortedMap::value_type({bytes, memory}, ts));

  // Account for the new allocation and keep the high watermark up to date
  total_size_ += bytes;
  if (max_total_size_ < total_size_) {
    max_total_size_ = total_size_;
  }
}
// ================================================================================================
// Searches the heap for a reusable allocation of at least `size` bytes and, if one is found,
// removes it from the heap and returns it.
//   size          - requested size in bytes
//   stream        - stream the memory will be used on (forwarded to IsSafeFind)
//   opportunistic - forwarded to IsSafeFind
//   dptr          - if not nullptr, only the allocation with this SVM address is accepted
//   ts            - optional output; receives the event of the reused allocation
// Returns nullptr if no suitable allocation exists.
amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic,
                              void* dptr, MemoryTimestamp* ts) {
  // Runtime can accept an allocation with 12.5% on the size threshold.
  // The limit doesn't depend on the visited entry, so compute it once.
  const double max_accepted_size = (size / 8.0) * 9;

  // Entries are keyed by {size, memory}, hence the search starts at the first fitting size
  for (auto it = allocations_.lower_bound({size, nullptr}); it != allocations_.end(); ++it) {
    bool check_address = (dptr == nullptr);
    if (it->first.second->getSvmPtr() == dptr) {
      // If the search is done for the specified address then runtime must wait
      it->second.Wait();
      check_address = true;
    }
    // All remaining entries are even bigger, so the search can stop here
    if (it->first.first > max_accepted_size) {
      return nullptr;
    }
    // Check if size can match and it's safe to use this resource.
    if (check_address && it->second.IsSafeFind(stream, opportunistic)) {
      amd::Memory* memory = it->first.second;
      total_size_ -= memory->getSize();
      if (ts != nullptr) {
        // Preserve event, since the logic could skip GPU wait on reuse
        ts->event_ = it->second.event_;
      } else {
        // Nobody takes over the event; clear it the same way RemoveMemory() does
        it->second.SetEvent(nullptr);
      }
      // Remove found allocation from the map
      allocations_.erase(it);
      return memory;
    }
  }
  return nullptr;
}
// ================================================================================================
// Removes the entry for `memory` from the heap.
// If `ts` is provided it receives a copy of the entry's timestamp, otherwise the entry's
// event is cleared. Returns false if the heap doesn't track `memory`.
bool Heap::RemoveMemory(amd::Memory* memory, MemoryTimestamp* ts) {
  const auto bytes = memory->getSize();
  auto entry = allocations_.find({bytes, memory});
  if (entry == allocations_.end()) {
    return false;
  }

  if (ts == nullptr) {
    entry->second.SetEvent(nullptr);
  } else {
    // Preserve timestamp info for possible reuse later
    *ts = entry->second;
  }

  total_size_ -= bytes;
  allocations_.erase(entry);
  return true;
}
// ================================================================================================
// Frees the SVM buffer behind the entry `it` points to, clears its event and erases the
// entry. Returns the iterator following the erased element.
Heap::SortedMap::iterator Heap::EraseAllocaton(Heap::SortedMap::iterator& it) {
  amd::Memory* const mem_obj = it->first.second;
  const device::Memory* dev_mem = mem_obj->getDeviceMemory(*device_->devices()[0]);

  // Prefer the device virtual address; fall back to the SVM pointer if there is none
  void* release_ptr = reinterpret_cast<void*>(dev_mem->virtualAddress());
  total_size_ -= it->first.first;
  if (release_ptr == nullptr) {
    release_ptr = mem_obj->getSvmPtr();
  }
  amd::SvmBuffer::free(mem_obj->getContext(), release_ptr);

  // Clear HIP event
  it->second.SetEvent(nullptr);
  // Remove the allocation from the map
  return allocations_.erase(it);
}
// ================================================================================================
// Releases allocations, starting from the beginning of the map, until the heap holds no more
// than `min_bytes_to_hold` bytes or every entry was visited. With `safe_release` each visited
// entry is waited on first. Entries that aren't safe to release are skipped.
// Always returns true.
bool Heap::ReleaseAllMemory(size_t min_bytes_to_hold, bool safe_release) {
  auto cursor = allocations_.begin();
  // Stop as soon as the heap is small enough
  while ((cursor != allocations_.end()) && (total_size_ > min_bytes_to_hold)) {
    if (safe_release) {
      // Safe release forces unconditional wait for memory
      cursor->second.Wait();
    }
    if (!cursor->second.IsSafeRelease()) {
      ++cursor;
      continue;
    }
    cursor = EraseAllocaton(cursor);
  }
  return true;
}
// ================================================================================================
// Releases allocations that are safe to release until the heap size drops to the configured
// release threshold or every entry was visited. Always returns true.
bool Heap::ReleaseAllMemory() {
  auto cursor = allocations_.begin();
  // Make sure the heap keeps holding the minimum number of bytes
  while ((cursor != allocations_.end()) && (total_size_ > release_threshold_)) {
    if (!cursor->second.IsSafeRelease()) {
      ++cursor;
      continue;
    }
    cursor = EraseAllocaton(cursor);
  }
  return true;
}
// ================================================================================================
// Drops `stream` from the safe-stream set of every allocation tracked by this heap.
void Heap::RemoveStream(Stream* stream) {
  // Iterate by reference: iterating by value would erase the stream from a temporary copy
  // of each entry and leave the stored timestamps untouched.
  for (auto& entry : allocations_) {
    entry.second.safe_streams_.erase(stream);
  }
}
// ================================================================================================
// Updates the peer access state of every allocation in the heap for `device`.
// Enabling calls allowPeerAccess() on the peer device and marks the memory as accessible;
// disabling only clears the flag on the memory.
void Heap::SetAccess(hip::Device* device, bool enable) {
  for (const auto& entry : allocations_) {
    auto peer_device = device->asContext()->devices()[0];
    device::Memory* peer_mem = entry.first.second->getDeviceMemory(*peer_device);
    if (peer_mem == nullptr) {
      LogError("Couldn't find device memory for P2P access");
      continue;
    }

    const bool already_allowed = peer_mem->getAllowedPeerAccess();
    if (enable && !already_allowed) {
      // Enable p2p access for the specified device
      peer_device->allowPeerAccess(peer_mem);
      peer_mem->setAllowedPeerAccess(true);
    } else if (!enable && already_allowed) {
      peer_mem->setAllowedPeerAccess(false);
    }
  }
}
// ================================================================================================
// Allocates `size` bytes from the pool for use on `stream` and returns the device pointer.
// The free heap is searched first (with `dptr` forwarded to Heap::FindMemory); if nothing can
// be reused, a new SVM buffer is allocated. The resulting memory object is placed into the
// busy heap and the reference counter of the pool is incremented.
// Returns nullptr if the pool's maxSize property would be exceeded, if `size` is bigger than
// the device's maxMemAllocSize_, or if the SVM allocation fails.
void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) {
amd::ScopedLock lock(lock_pool_ops_);
void* dev_ptr = nullptr;
MemoryTimestamp ts;
// Try to reuse a freed allocation; ts receives its event on success
amd::Memory* memory = free_heap_.FindMemory(size, stream, Opportunistic(), dptr, &ts);
if (memory == nullptr) {
// A maxSize of 0 means the pool size isn't limited by this check.
// NOTE(review): the limit is compared against max_total_size_, which GetAttribute() reports
// as the high watermark and SetAttribute() can reset to 0 - confirm that the watermark,
// not the current busy + free total, is the intended value here.
if (Properties().maxSize != 0 && (max_total_size_ + size) > Properties().maxSize) {
return nullptr;
}
amd::Context* context = device_->asContext();
const auto& dev_info = context->devices()[0]->info();
// Reject requests the device can't satisfy with a single allocation
if (dev_info.maxMemAllocSize_ < size) {
return nullptr;
}
// Translate the pool state into SVM allocation flags
cl_svm_mem_flags flags = (state_.interprocess_) ? ROCCLR_MEM_INTERPROCESS : 0;
flags |= (state_.phys_mem_) ? ROCCLR_MEM_PHYMEM : 0;
dev_ptr = amd::SvmBuffer::malloc(*context, flags, size, dev_info.memBaseAddrAlign_, nullptr);
if (dev_ptr == nullptr) {
// Allocation failed; the memory info is queried only to produce the error message
size_t free = 0, total =0;
hipError_t err = hipMemGetInfo(&free, &total);
if (err == hipSuccess) {
LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu",
size, free, total);
}
return nullptr;
}
size_t offset = 0;
// NOTE(review): the result of getMemoryObject() is dereferenced below without a null check -
// presumably it can't fail right after a successful malloc; confirm.
memory = getMemoryObject(dev_ptr, offset);
// Saves the current device id so that it can be accessed later
memory->getUserData().deviceId = device_->deviceId();
// Update access for the new allocation from other devices
for (const auto& it : access_map_) {
auto vdi_device = it.first->asContext()->devices()[0];
device::Memory* mem = memory->getDeviceMemory(*vdi_device);
if ((mem != nullptr) && (it.second != hipMemAccessFlagsProtNone)) {
vdi_device->allowPeerAccess(mem);
mem->setAllowedPeerAccess(true);
}
}
} else {
// Reuse path: make sure the pointer is registered in the memory object map
dev_ptr = memory->getSvmPtr();
if (!amd::MemObjMap::FindMemObj(dev_ptr))
amd::MemObjMap::AddMemObj(dev_ptr, memory);
}
// Place the allocated memory into the busy heap
ts.AddSafeStream(stream);
busy_heap_.AddMemory(memory, ts);
// Track the high watermark of all memory held by the pool (busy + free)
max_total_size_ = std::max(max_total_size_, busy_heap_.GetTotalSize() +
free_heap_.GetTotalSize());
// Increment the reference counter on the pool
retain();
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Pool AllocMem: %p, %p", memory->getSvmPtr(), memory);
return dev_ptr;
}
// ================================================================================================
// Returns `memory` from the busy heap to the free heap of this pool.
//   memory - memory object to free; if it refers to a physical memory object through its user
//            data, that object is freed instead
//   stream - stream the free is ordered on; nullptr is treated as a safe release
//   event  - optional event that marks availability of the memory; if nullptr and a stream is
//            given, a marker event is created and recorded on the stream
// Returns false if the busy heap of this pool doesn't contain the memory, true otherwise.
// On success the reference counter of the pool is decremented.
bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) {
  {
    amd::ScopedLock lock(lock_pool_ops_);
    if (memory->getUserData().phys_mem_obj != nullptr) {
      memory = memory->getUserData().phys_mem_obj;
    }
    // If the free heap grows over the busy heap, then force release
    if (AMD_DIRECT_DISPATCH && (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize())) {
      // Use event base release to reduce memory pressure
      constexpr size_t kBytesToHold = 0;
      free_heap_.ReleaseAllMemory(kBytesToHold);
      // If free memory is less than 12.5% of total, then force wait release
      size_t free = 0;
      size_t total = 0;
      hipError_t err = hipMemGetInfo(&free, &total);
      if ((err == hipSuccess) && (free < (total >> 3))) {
        constexpr bool kSafeRelease = true;
        free_heap_.ReleaseAllMemory(free_heap_.GetTotalSize() >> 1, kSafeRelease);
      }
    }
    MemoryTimestamp ts;
    // Remove memory object from the busy pool
    if (!busy_heap_.RemoveMemory(memory, &ts)) {
      // This pool doesn't contain memory
      return false;
    }
    ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Pool FreeMem: %p, %p", memory->getSvmPtr(), memory);
    if (memory->getUserData().vaddr_mem_obj != nullptr) {
      auto va_mem = memory->getUserData().vaddr_mem_obj;
      if (stream == nullptr) {
        stream = g_devices[memory->getUserData().deviceId]->NullStream();
      }
      // Unmap virtual address from memory
      auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{},
                                            va_mem->getSvmPtr(), va_mem->getSize(), nullptr);
      cmd->enqueue();
      cmd->release();
    }
    if (stream != nullptr) {
      // The stream of destruction is a safe stream, because the app must handle sync
      ts.AddSafeStream(stream);
      if (event == nullptr) {
        // Add a marker to the stream to trace availability of this memory.
        // Note: plain new reports failure by throwing, thus there is no null check.
        Event* e = new hip::Event(0);
        if (hipSuccess == e->addMarker(reinterpret_cast<hipStream_t>(stream), nullptr, true)) {
          ts.SetEvent(e);
          // Make sure runtime sends a notification
          static_cast<void>(e->ready());
        } else {
          // The marker was never handed over to the timestamp; destroy it to avoid a leak
          delete e;
        }
      } else {
        ts.SetEvent(event);
      }
    } else {
      // Assume a safe release from hipFree() if stream is nullptr
      ts.SetEvent(nullptr);
    }
    free_heap_.AddMemory(memory, ts);
  }
  // Decrement the reference counter on the pool.
  // Note: It may delete memory pool for the last allocation. Thus, the scope lock can't include
  // this call.
  release();
  return true;
}
// ================================================================================================
// Releases the allocations of both the free and the busy heap with safe release enabled,
// i.e. every visited allocation is waited on first, and with 0 bytes to hold.
// NOTE(review): unlike the neighbouring pool operations, lock_pool_ops_ isn't taken here -
// presumably the caller guarantees exclusive access; confirm.
void MemoryPool::ReleaseAllMemory() {
constexpr bool kSafeRelease = true;
free_heap_.ReleaseAllMemory(0, kSafeRelease);
busy_heap_.ReleaseAllMemory(0, kSafeRelease);
}
// ================================================================================================
// Releases freed allocations that are safe to release, under the pool lock. The free heap
// stops releasing once its size drops to its release threshold.
void MemoryPool::ReleaseFreedMemory() {
amd::ScopedLock lock(lock_pool_ops_);
free_heap_.ReleaseAllMemory();
}
// ================================================================================================
// Forwards the removal of `stream` to the free heap under the pool lock.
// Only the free heap is updated; the busy heap isn't touched.
void MemoryPool::RemoveStream(Stream* stream) {
amd::ScopedLock lock(lock_pool_ops_);
free_heap_.RemoveStream(stream);
}
// ================================================================================================
// Releases freed allocations, under the pool lock, until the free heap holds no more than
// `min_bytes_to_hold` bytes. Allocations that aren't safe to release yet are kept.
void MemoryPool::TrimTo(size_t min_bytes_to_hold) {
amd::ScopedLock lock(lock_pool_ops_);
free_heap_.ReleaseAllMemory(min_bytes_to_hold);
}
// ================================================================================================
// Sets a memory pool attribute.
//   attr  - attribute to change
//   value - pointer to the new value: int32_t for the reuse policies, uint64_t for the
//           release threshold and the high watermarks
// Returns hipSuccess on success. Returns hipErrorInvalidValue if `value` is nullptr, if the
// attribute is read-only or unknown, or if a high watermark is set to anything but 0.
hipError_t MemoryPool::SetAttribute(hipMemPoolAttr attr, void* value) {
  // Every accepted attribute reads from value, so reject a null pointer up front
  if (value == nullptr) {
    return hipErrorInvalidValue;
  }
  amd::ScopedLock lock(lock_pool_ops_);
  switch (attr) {
    case hipMemPoolReuseFollowEventDependencies:
      // Enable/disable HIP events tracking from the app's dependencies
      state_.event_dependencies_ = *reinterpret_cast<int32_t*>(value);
      break;
    case hipMemPoolReuseAllowOpportunistic:
      // Enable/disable HIP event check for freed memory
      state_.opportunistic_ = *reinterpret_cast<int32_t*>(value);
      break;
    case hipMemPoolReuseAllowInternalDependencies:
      // Enable/disable internal extra dependencies introduced in runtime
      state_.internal_dependencies_ = *reinterpret_cast<int32_t*>(value);
      break;
    case hipMemPoolAttrReleaseThreshold:
      free_heap_.SetReleaseThreshold(*reinterpret_cast<uint64_t*>(value));
      break;
    case hipMemPoolAttrReservedMemCurrent:
    case hipMemPoolAttrUsedMemCurrent:
      // Should be GetAttribute only
      return hipErrorInvalidValue;
    case hipMemPoolAttrReservedMemHigh: {
      const uint64_t reset = *reinterpret_cast<uint64_t*>(value);
      // Only 0 is accepted
      if (reset != 0) {
        return hipErrorInvalidValue;
      }
      max_total_size_ = reset;
      break;
    }
    case hipMemPoolAttrUsedMemHigh: {
      const uint64_t reset = *reinterpret_cast<uint64_t*>(value);
      // Only 0 is accepted
      if (reset != 0) {
        return hipErrorInvalidValue;
      }
      busy_heap_.SetMaxTotalSize(reset);
      break;
    }
    default:
      return hipErrorInvalidValue;
  }
  return hipSuccess;
}
// ================================================================================================
// Queries a memory pool attribute.
//   attr  - attribute to read
//   value - output pointer: int32_t for the reuse policies, uint64_t for the release
//           threshold and the memory counters
// Returns hipSuccess on success, hipErrorInvalidValue if `value` is nullptr or the attribute
// is unknown.
hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
  // Every attribute is written through value, so reject a null pointer up front
  if (value == nullptr) {
    return hipErrorInvalidValue;
  }
  amd::ScopedLock lock(lock_pool_ops_);
  switch (attr) {
    case hipMemPoolReuseFollowEventDependencies:
      // State of HIP events tracking from the app's dependencies
      *reinterpret_cast<int32_t*>(value) = EventDependencies();
      break;
    case hipMemPoolReuseAllowOpportunistic:
      // State of HIP event check for freed memory
      *reinterpret_cast<int32_t*>(value) = Opportunistic();
      break;
    case hipMemPoolReuseAllowInternalDependencies:
      // State of internal extra dependencies introduced in runtime
      *reinterpret_cast<int32_t*>(value) = InternalDependencies();
      break;
    case hipMemPoolAttrReleaseThreshold:
      *reinterpret_cast<uint64_t*>(value) = free_heap_.GetReleaseThreshold();
      break;
    case hipMemPoolAttrReservedMemCurrent:
      // All memory currently held by the pool: busy and free heaps together
      *reinterpret_cast<uint64_t*>(value) = busy_heap_.GetTotalSize() + free_heap_.GetTotalSize();
      break;
    case hipMemPoolAttrReservedMemHigh:
      // High watermark of all memory held by the pool, since the last reset
      *reinterpret_cast<uint64_t*>(value) = max_total_size_;
      break;
    case hipMemPoolAttrUsedMemCurrent:
      // Total currently used memory by the pool
      *reinterpret_cast<uint64_t*>(value) = busy_heap_.GetTotalSize();
      break;
    case hipMemPoolAttrUsedMemHigh:
      // High watermark of all used memory, since the last reset
      *reinterpret_cast<uint64_t*>(value) = busy_heap_.GetMaxTotalSize();
      break;
    default:
      return hipErrorInvalidValue;
  }
  return hipSuccess;
}
// ================================================================================================
// Changes the access flags `device` has to the memory of this pool.
// Requests for the pool's own device and requests that don't change the recorded flags are
// ignored. Otherwise the new flags are recorded and applied to both heaps.
void MemoryPool::SetAccess(hip::Device* device, hipMemAccessFlags flags) {
  amd::ScopedLock lock(lock_pool_ops_);
  // Nothing to do for the pool device where memory was allocated
  if (device == device_) {
    return;
  }

  // A device without a record is treated as having no access
  const auto known = access_map_.find(device);
  const hipMemAccessFlags previous_flags =
      (known != access_map_.end()) ? known->second : hipMemAccessFlagsProtNone;
  if (previous_flags == flags) {
    return;
  }

  // Save the access state in the device map
  access_map_[device] = flags;
  // Read and read-write both count as enabled access
  const bool enable_access =
      (flags == hipMemAccessFlagsProtRead) || (flags == hipMemAccessFlagsProtReadWrite);
  // Update device access on the both pools
  busy_heap_.SetAccess(device, enable_access);
  free_heap_.SetAccess(device, enable_access);
}
// ================================================================================================
// Reports the access flags `device` has to the memory of this pool through `flags`.
// A recorded state takes precedence; without a record the pool device gets read-write
// access and any other device gets none.
void MemoryPool::GetAccess(hip::Device* device, hipMemAccessFlags* flags) {
  amd::ScopedLock lock(lock_pool_ops_);
  const auto known = access_map_.find(device);
  if (known != access_map_.end()) {
    // Access was set explicitly before
    *flags = known->second;
  } else if (device == device_) {
    // Current pool device has full access to memory allocation
    *flags = hipMemAccessFlagsProtReadWrite;
  } else {
    *flags = hipMemAccessFlagsProtNone;
  }
}
// ================================================================================================
// Frees every allocation that is still in the busy heap, ordering the frees on `stream`.
// NOTE(review): FreeMemory() drops a pool reference and may delete the pool with the last
// allocation - presumably the caller holds its own reference while this loop runs; confirm.
void MemoryPool::FreeAllMemory(Stream* stream) {
  while (!busy_heap_.Allocations().empty()) {
    amd::Memory* memory = busy_heap_.Allocations().begin()->first.second;
    if (!FreeMemory(memory, stream)) {
      // The entry wasn't removed from the busy heap, so retrying would never terminate
      LogError("Couldn't free memory from the busy heap");
      break;
    }
  }
}
// ================================================================================================
// Exports the pool for sharing with another process.
// On the first call an IPC memory block is created and filled with the pool state and the
// per-device access table; later calls return the handle stored in that block.
// Returns a value-initialized handle if the IPC memory can't be created.
amd::Os::FileDesc MemoryPool::Export() {
  amd::ScopedLock lock(lock_pool_ops_);
  // The pool was exported before, reuse the existing handle
  if (shared_ != nullptr) {
    return shared_->handle_;
  }
  constexpr uint32_t kFileNameSize = 20;
  char file_name[kFileNameSize];
  // Generate a unique name from the mempool pointer
  // Note: Windows can accept an unnamed allocation
  snprintf(file_name, kFileNameSize, "%p", this);
  amd::Os::FileDesc handle{};
  shared_ = reinterpret_cast<SharedMemPool*>(amd::Os::CreateIpcMemory(
      file_name, sizeof(SharedMemPool), &handle));
  if (shared_ != nullptr) {
    shared_->handle_ = handle;
    shared_->state_ = state_.value_;
    shared_->access_size_ = 0;
    memset(shared_->access_, 0, sizeof(SharedAccess) * kMaxMgpuAccess);
    assert((access_map_.size() <= kMaxMgpuAccess) && "Can't support more GPU(s) in shared access" );
    for (const auto& it : access_map_) {
      // The assert above is compiled out with NDEBUG, so the table capacity is enforced
      // here as well to never write past the end of access_
      if (shared_->access_size_ >= kMaxMgpuAccess) {
        LogError("Can't support more GPU(s) in shared access");
        break;
      }
      shared_->access_[shared_->access_size_] = SharedAccess{it.first->deviceId(), it.second};
      shared_->access_size_++;
    }
  }
  return handle;
}
// ================================================================================================
// Imports the state and the per-device access table of a pool exported by another process.
//   handle - IPC handle produced by Export()
// Returns true if the shared memory block could be opened, false otherwise.
bool MemoryPool::Import(amd::Os::FileDesc handle) {
  amd::ScopedLock lock(lock_pool_ops_);
  auto shared = reinterpret_cast<SharedMemPool*>(
      amd::Os::OpenIpcMemory(nullptr, handle, sizeof(SharedMemPool)));
  if (shared == nullptr) {
    return false;
  }
  state_.value_ = shared->state_;
  // access_size_ comes from memory written by another process, so never trust it beyond the
  // capacity of the access table.
  // NOTE(review): device_id_ is used as an index into g_devices without a range check -
  // presumably the exporter only stores valid ids; confirm.
  for (uint32_t i = 0; (i < shared->access_size_) && (i < kMaxMgpuAccess); ++i) {
    access_map_[g_devices[shared->access_[i].device_id_]] = shared->access_[i].flags_;
  }
  return true;
}
}