Cache scratch allocations.

Avoids calling into KFD to map/unmap scratch allocations for
every dispatch that uses large scratch.

Change-Id: I9fab5705251ec82b03e4f2f2ca6da7cdccabefb9
This commit is contained in:
Sean Keely
2020-10-07 06:41:19 -05:00
förälder 32d0fcafa9
incheckning 27e044ae4d
8 ändrade filer med 383 tillägg och 98 borttagningar
+4 -2
Visa fil
@@ -61,8 +61,6 @@ class Signal;
typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source,
void* data);
class MemoryRegion;
// Agent is intended to be a pure interface class and may be wrapped or
// replaced by tools libraries. All functions other than Convert, node_id,
// device_type, and public_handle must be virtual.
@@ -260,6 +258,10 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
return stat;
}
// Asks every memory region reported by regions() to trim itself.
virtual void Trim() {
  for (const auto& region : regions()) {
    region->Trim();
  }
}
protected:
// Intention here is to have a polymorphic update procedure for public_handle_
// which is callable on any Agent* but only from some class derived from
+10 -12
Visa fil
@@ -55,6 +55,7 @@
#include "core/inc/blit.h"
#include "core/inc/signal.h"
#include "core/inc/cache.h"
#include "core/inc/scratch_cache.h"
#include "core/util/small_heap.h"
#include "core/util/locks.h"
#include "core/util/lazy_ptr.h"
@@ -63,18 +64,7 @@ namespace rocr {
namespace AMD {
class MemoryRegion;
// @brief Contains scratch memory information.
struct ScratchInfo {
void* queue_base;
size_t size;
size_t size_per_thread;
uint32_t lanes_per_wave;
ptrdiff_t queue_process_offset;
bool large;
bool retry;
hsa_signal_t queue_retry;
uint64_t wanted_slots;
};
typedef ScratchCache::ScratchInfo ScratchInfo;
// @brief Interface to represent a GPU agent.
class GpuAgentInt : public core::Agent {
@@ -331,6 +321,8 @@ class GpuAgent : public GpuAgentInt {
return memory_max_frequency_;
}
void Trim() override;
protected:
static const uint32_t minAqlSize_ = 0x1000; // 4KB min
static const uint32_t maxAqlSize_ = 0x20000; // 8MB max
@@ -494,6 +486,10 @@ class GpuAgent : public GpuAgentInt {
// @brief Deregister scratch notification signals.
void ClearScratchNotifiers() { scratch_notifiers_.clear(); }
// @brief Releases scratch back to the driver.
// caller must hold scratch_lock_.
void ReleaseScratch(void* base, size_t size, bool large);
// Bind index of peer device that is connected via xGMI links
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
@@ -517,6 +513,8 @@ class GpuAgent : public GpuAgentInt {
KernelMutex lock_;
} gws_queue_;
ScratchCache scratch_cache_;
DISALLOW_COPY_AND_ASSIGN(GpuAgent);
};
@@ -136,6 +136,8 @@ class MemoryRegion : public core::MemoryRegion {
hsa_status_t AssignAgent(void* ptr, size_t size, const core::Agent& agent,
hsa_access_permission_t access) const;
void Trim() const;
__forceinline bool IsLocalMemory() const {
return ((mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) ||
(mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC));
+5 -1
Visa fil
@@ -47,8 +47,9 @@
#include <vector>
#include "core/inc/agent.h"
#include "core/inc/hsa_internal.h"
#include "core/inc/checked.h"
#include "core/util/utils.h"
namespace rocr {
namespace core {
@@ -106,6 +107,9 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
virtual hsa_status_t AssignAgent(void* ptr, size_t size, const Agent& agent,
hsa_access_permission_t access) const = 0;
// Releases any cached memory that may be held within the allocator.
// The default implementation does nothing; it is virtual so that a region
// type which caches memory can provide its own release logic.
virtual void Trim() const {}
__forceinline bool fine_grain() const { return fine_grain_; }
__forceinline bool full_profile() const { return full_profile_; }
+191
Visa fil
@@ -0,0 +1,191 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2020-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
#define HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
#include "core/inc/amd_gpu_agent.h"
#include "core/util/locks.h"
#include "core/util/utils.h"
#include <map>
#include <functional>
namespace rocr {
namespace AMD {
// @brief Cache of scratch memory blocks, indexed by block size.
//
// Blocks registered through insert() remain in the cache after free() so that a
// later alloc() of a compatible size can reuse them. A block only leaves the
// cache through trim() (or through free() once a trim has been requested for
// it), at which point the deallocator supplied at construction is invoked.
//
// The class performs no locking of its own.
class ScratchCache {
 public:
  // @brief Book-keeping record for one cached scratch block.
  struct node {
    // State bits. ALLOC marks a block that is in use. TRIM is or'ed onto an
    // in-use block to request its release when it is freed. STEAL is declared
    // but not used by this class.
    enum STATE { FREE = 0, ALLOC = 1, TRIM = 2, STEAL = 4 };

    void* base;
    bool large;
    uint32_t state;

    // Every member is initialized so that a default constructed node never
    // carries an indeterminate value.
    node() : base(nullptr), large(false), state(FREE) {}

    bool isFree() const { return state == FREE; }

    // True when the block is in use and has been marked for release.
    bool trimPending() const { return state == (ALLOC | TRIM); }

    // Marks an in-use block for release at free time.
    void trim() {
      assert(!isFree() && "Trim of free scratch node.");
      state |= TRIM;
    }

    void free() {
      assert(!isFree() && "Free of free scratch node.");
      state = FREE;
    }

    void alloc() {
      assert(isFree() && "Alloc of non-free scratch node.");
      state = ALLOC;
    }
  };

  typedef ::std::multimap<size_t, node> map_t;
  typedef map_t::iterator ref_t;
  // Invoked as deallocator(base, size, large) when a block leaves the cache.
  typedef ::std::function<void(void*, size_t, bool)> deallocator_t;

  // @brief Contains scratch memory information.
  struct ScratchInfo {
    void* queue_base;
    // Size to fill the machine with size_per_thread
    size_t size;
    // Size to satisfy the present dispatch without throttling.
    size_t dispatch_size;
    size_t size_per_thread;
    uint32_t lanes_per_wave;
    ptrdiff_t queue_process_offset;
    bool large;
    bool retry;
    hsa_signal_t queue_retry;
    uint64_t wanted_slots;
    // Cache entry backing queue_base; written by alloc() and insert(), read by free().
    ScratchCache::ref_t scratch_node;
  };

  ScratchCache(const ScratchCache& rhs) = delete;
  ScratchCache(ScratchCache&& rhs) = delete;
  ScratchCache& operator=(const ScratchCache& rhs) = delete;
  ScratchCache& operator=(ScratchCache&& rhs) = delete;

  ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {}

  // The cache is expected to have been emptied before destruction.
  ~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); }

  // @brief Tries to satisfy a request from the cache.
  //
  // On success the chosen block is marked in use, info.queue_base and
  // info.scratch_node are set, and for large requests info.size is updated to
  // the size of the block actually chosen.
  //
  // @retval true if a cached block was handed out, false otherwise.
  bool alloc(ScratchInfo& info) {
    // A zero sized request never matches a cached block.
    if (info.size == 0) return false;

    // First cached block that is at least as big as the request.
    ref_t it = map.lower_bound(info.size);
    if (it == map.end()) return false;

    // Small requests must have an exact size match and be small.
    if (!info.large) {
      for (; (it != map.end()) && (it->first == info.size); ++it) {
        node& candidate = it->second;
        if (candidate.isFree() && !candidate.large) {
          candidate.alloc();
          info.queue_base = candidate.base;
          info.scratch_node = it;
          return true;
        }
      }
      return false;
    }

    // Large requests may use a small allocation and do not require an exact size match.
    for (; it != map.end(); ++it) {
      node& candidate = it->second;
      if (candidate.isFree()) {
        candidate.alloc();
        info.queue_base = candidate.base;
        info.size = it->first;
        info.scratch_node = it;
        return true;
      }
    }
    return false;
  }

  // @brief Returns the block referenced by info.scratch_node to the cache.
  //
  // If a trim was requested while the block was in use, the block is handed to
  // the deallocator and removed from the cache instead of being kept.
  void free(ScratchInfo& info) {
    ref_t it = info.scratch_node;
    assert(!it->second.isFree() && "free called on free scratch node.");
    if (it->second.trimPending()) {
      dealloc(it->second.base, it->first, it->second.large);
      map.erase(it);
      return;
    }
    it->second.free();
  }

  // @brief Releases every unused block through the deallocator.
  //
  // @param trim_nodes_in_use When true, blocks currently in use are marked so
  // that they are released when freed rather than kept in the cache.
  //
  // @retval true if the cache held at least one block when trim was called.
  bool trim(bool trim_nodes_in_use) {
    const bool had_nodes = !map.empty();
    for (ref_t it = map.begin(); it != map.end();) {
      if (it->second.isFree()) {
        dealloc(it->second.base, it->first, it->second.large);
        // erase returns the iterator following the removed element.
        it = map.erase(it);
      } else {
        if (trim_nodes_in_use) it->second.trim();
        ++it;
      }
    }
    return had_nodes;
  }

  // @brief Registers a new block, described by info, as in use and records its
  // cache entry in info.scratch_node.
  void insert(ScratchInfo& info) {
    node n;
    n.base = info.queue_base;
    n.large = info.large;
    n.alloc();
    info.scratch_node = map.insert(std::make_pair(info.size, n));
  }

 private:
  map_t map;
  deallocator_t dealloc;
};
} // namespace AMD
} // namespace rocr
#endif // header guard
@@ -792,11 +792,25 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
// Align whole waves to 1KB.
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
#ifndef NDEBUG
scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) *
pkt.dispatch.grid_size_z) / scratch.lanes_per_wave;
uint64_t lanes_per_group =
(uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
pkt.dispatch.workgroup_size_z;
uint64_t waves_per_group =
(lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave;
uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
pkt.dispatch.workgroup_size_x) *
((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
pkt.dispatch.workgroup_size_y) *
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
pkt.dispatch.workgroup_size_z);
scratch.wanted_slots = groups * waves_per_group;
scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
#endif
scratch.wanted_slots =
Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks));
scratch.dispatch_size =
scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;
queue->agent_->AcquireQueueScratch(scratch);
@@ -1117,7 +1131,7 @@ void AqlQueue::InitScratchSRD() {
queue_scratch_.size_per_thread) + 1023) / 1024);
tmpring_size.bits.WAVESIZE = wave_scratch;
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024);
tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
amd_queue_.compute_tmpring_size = tmpring_size.u32All;
return;
+149 -77
Visa fil
@@ -91,7 +91,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
memory_bus_width_(0),
memory_max_frequency_(0),
ape1_base_(0),
ape1_size_(0) {
ape1_size_(0),
scratch_cache_(
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) {
const bool is_apu_node = (properties_.NumCPUCores > 0);
profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
@@ -171,6 +173,7 @@ GpuAgent::~GpuAgent() {
_aligned_free(reinterpret_cast<void*>(ape1_base_));
}
scratch_cache_.trim(true);
if (scratch_pool_.base() != NULL) {
hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size());
}
@@ -1007,7 +1010,8 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
scratch.size_per_thread = private_segment_size;
const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
scratch.size =
scratch.size_per_thread * properties_.MaxSlotsScratchCU * scratch.lanes_per_wave * num_cu;
scratch.queue_base = nullptr;
scratch.queue_process_offset = 0;
@@ -1053,104 +1057,166 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024);
if (size_per_wave > MAX_WAVE_SCRATCH) return;
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
// Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue.
/*
Determine size class needed.
Scratch allocations come in two flavors based on how it is retired. Small allocations may be
kept bound to a queue and reused by firmware. This memory can not be reclaimed by the runtime
on demand so must be kept small to avoid egregious OOM conditions. Other allocations, aka large,
may be used by firmware only for one dispatch and are then surrendered to the runtime. This has
significant latency so we don't want to make all scratch allocations large (ie single use).
Note that the designation "large" is for contrast with "small", which must really be small
amounts of memory, and does not always imply a large quantity of memory is needed. Other
properties of the allocation may require single use and so qualify the allocation or use as
"large".
Here we decide on the boundaries for small scratch allocations. Both the largest small single
allocation and the maximum amount of memory bound by small allocations are limited. Additionally
some legacy devices do not support large scratch.
For small scratch we must allocate enough memory for every physical scratch slot.
For large scratch compute the minimum memory needed to run the dispatch without limiting
occupancy.
Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single
allocation.
*/
size_t small_limit = scratch_pool_.size() >> 3;
// Lift limit for 2.10 release RCCL workaround.
size_t single_limit = 146800640; //small_limit >> 2;
bool use_reclaim = true;
bool large = (scratch.size > single_limit) ||
(scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit);
large = (isa_->GetMajorVersion() < 8) ? false : large;
large = core::Runtime::runtime_singleton_->flag().no_scratch_reclaim() ? false : large;
if (large)
scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
else
scratch.queue_base = scratch_pool_.alloc(scratch.size);
large |= scratch.queue_base > scratch_pool_.high_split();
scratch.large = large;
if ((isa_->GetMajorVersion() < 8) ||
core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) {
large = false;
use_reclaim = false;
}
scratch.queue_process_offset =
(need_queue_scratch_base)
? uintptr_t(scratch.queue_base)
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
// If large is selected then the scratch will not be retained.
// In that case allocate the minimum necessary for the dispatch since we don't need all slots.
if (large) scratch.size = scratch.dispatch_size;
if (scratch.queue_base != nullptr) {
if (profile_ == HSA_PROFILE_FULL) return;
if (profile_ == HSA_PROFILE_BASE) {
HSAuint64 alternate_va;
if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
HSAKMT_STATUS_SUCCESS) {
if (large) scratch_used_large_ += scratch.size;
return;
// Ensure mapping will be in whole pages.
scratch.size = AlignUp(scratch.size, 4096);
/*
Sequence of attempts is:
check cache
attempt a new allocation
trim unused blocks from cache
attempt a new allocation
check cache for sufficient used block, steal and wait (not implemented)
trim used blocks from cache, evaluate retry
reduce occupancy
*/
// Lambda called in place.
// Used to allow exit from nested loops.
[&]() {
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
// Check scratch cache
if (scratch_cache_.alloc(scratch)) return;
// Attempt new allocation.
for (int i = 0; i < 2; i++) {
if (large)
scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
else
scratch.queue_base = scratch_pool_.alloc(scratch.size);
scratch.large = large | (scratch.queue_base > scratch_pool_.high_split());
assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled.");
if (scratch.queue_base != nullptr) {
if (profile_ == HSA_PROFILE_FULL) return;
if (profile_ == HSA_PROFILE_BASE) {
HSAuint64 alternate_va;
if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
HSAKMT_STATUS_SUCCESS) {
if (scratch.large) scratch_used_large_ += scratch.size;
scratch_cache_.insert(scratch);
return;
}
}
}
// Scratch request failed allocation or mapping.
scratch_pool_.free(scratch.queue_base);
scratch.queue_base = nullptr;
// Release cached scratch and retry.
// First iteration trims unused blocks, second trims all.
scratch_cache_.trim(i == 1);
}
}
// Scratch request failed allocation or mapping.
scratch_pool_.free(scratch.queue_base);
scratch.queue_base = nullptr;
// Retry if large may yield needed space.
if (scratch_used_large_ != 0) {
if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
return;
}
// Fail scratch allocation if reducing occupancy is disabled.
if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return;
// Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
scratch.size);
const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
const uint64_t total_waves = scratch.size / size_per_wave;
uint64_t waves_per_cu = total_waves / num_cus;
while (waves_per_cu != 0) {
size_t size = waves_per_cu * num_cus * size_per_wave;
void* base = scratch_pool_.alloc(size);
HSAuint64 alternate_va;
if ((base != nullptr) &&
((profile_ == HSA_PROFILE_FULL) ||
(hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
// Scratch allocated and either full profile or map succeeded.
scratch.queue_base = base;
scratch.size = size;
scratch.queue_process_offset = (need_queue_scratch_base)
? uintptr_t(scratch.queue_base)
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
scratch.large = true;
scratch_used_large_ += scratch.size;
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
// Retry if large may yield needed space.
if (scratch_used_large_ != 0) {
if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
return;
}
scratch_pool_.free(base);
waves_per_cu--;
}
// Failed to allocate minimal scratch
assert(scratch.queue_base == nullptr && "bad scratch data");
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print(" Could not allocate scratch for one wave per CU.\n");
// Fail scratch allocation if reducing occupancy is disabled.
if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
return;
// Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
scratch.size);
const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
const uint64_t total_waves = scratch.size / size_per_wave;
uint64_t waves_per_cu = total_waves / num_cus;
while (waves_per_cu != 0) {
size_t size = waves_per_cu * num_cus * size_per_wave;
void* base = scratch_pool_.alloc_high(size);
HSAuint64 alternate_va;
if ((base != nullptr) &&
((profile_ == HSA_PROFILE_FULL) ||
(hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
// Scratch allocated and either full profile or map succeeded.
scratch.queue_base = base;
scratch.size = size;
scratch.large = true;
scratch_used_large_ += scratch.size;
scratch_cache_.insert(scratch);
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
return;
}
scratch_pool_.free(base);
waves_per_cu--;
}
// Failed to allocate minimal scratch
assert(scratch.queue_base == nullptr && "bad scratch data");
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print(" Could not allocate scratch for one wave per CU.\n");
return;
}();
scratch.queue_process_offset = need_queue_scratch_base
? uintptr_t(scratch.queue_base)
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
}
void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
if (scratch.queue_base == nullptr) {
return;
}
if (scratch.queue_base == nullptr) return;
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
scratch_cache_.free(scratch);
scratch.queue_base = nullptr;
}
void GpuAgent::ReleaseScratch(void* base, size_t size, bool large) {
if (profile_ == HSA_PROFILE_BASE) {
if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) {
if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) {
assert(false && "Unmap scratch subrange failed!");
}
}
scratch_pool_.free(scratch.queue_base);
scratch.queue_base = nullptr;
scratch_pool_.free(base);
if (scratch.large) scratch_used_large_ -= scratch.size;
if (large) scratch_used_large_ -= size;
// Notify waiters that additional scratch may be available.
for (auto notifier : scratch_notifiers_) {
@@ -1444,5 +1510,11 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
return GetXgmiBlit(dst_agent);
}
void GpuAgent::Trim() {
Agent::Trim();
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
scratch_cache_.trim(false);
}
} // namespace amd
} // namespace rocr
@@ -201,7 +201,7 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi
// If it fails attempt to release memory from the block allocator and retry.
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
if (*address == nullptr) {
fragment_allocator_.trim();
owner()->Trim();
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
}
@@ -699,6 +699,8 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size,
return HSA_STATUS_SUCCESS;
}
// Trims this region's fragment allocator.
void MemoryRegion::Trim() const {
  fragment_allocator_.trim();
}
void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size.");