Cache scratch allocations.
Avoids calling to KFD to map/unmap scratch allocations for every large scratch using dispatch. Change-Id: I9fab5705251ec82b03e4f2f2ca6da7cdccabefb9
This commit is contained in:
@@ -61,8 +61,6 @@ class Signal;
|
||||
typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source,
|
||||
void* data);
|
||||
|
||||
class MemoryRegion;
|
||||
|
||||
// Agent is intended to be an pure interface class and may be wrapped or
|
||||
// replaced by tools libraries. All funtions other than Convert, node_id,
|
||||
// device_type, and public_handle must be virtual.
|
||||
@@ -260,6 +258,10 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
|
||||
return stat;
|
||||
}
|
||||
|
||||
virtual void Trim() {
|
||||
for (auto region : regions()) region->Trim();
|
||||
}
|
||||
|
||||
protected:
|
||||
// Intention here is to have a polymorphic update procedure for public_handle_
|
||||
// which is callable on any Agent* but only from some class dervied from
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
#include "core/inc/blit.h"
|
||||
#include "core/inc/signal.h"
|
||||
#include "core/inc/cache.h"
|
||||
#include "core/inc/scratch_cache.h"
|
||||
#include "core/util/small_heap.h"
|
||||
#include "core/util/locks.h"
|
||||
#include "core/util/lazy_ptr.h"
|
||||
@@ -63,18 +64,7 @@ namespace rocr {
|
||||
namespace AMD {
|
||||
class MemoryRegion;
|
||||
|
||||
// @brief Contains scratch memory information.
|
||||
struct ScratchInfo {
|
||||
void* queue_base;
|
||||
size_t size;
|
||||
size_t size_per_thread;
|
||||
uint32_t lanes_per_wave;
|
||||
ptrdiff_t queue_process_offset;
|
||||
bool large;
|
||||
bool retry;
|
||||
hsa_signal_t queue_retry;
|
||||
uint64_t wanted_slots;
|
||||
};
|
||||
typedef ScratchCache::ScratchInfo ScratchInfo;
|
||||
|
||||
// @brief Interface to represent a GPU agent.
|
||||
class GpuAgentInt : public core::Agent {
|
||||
@@ -331,6 +321,8 @@ class GpuAgent : public GpuAgentInt {
|
||||
return memory_max_frequency_;
|
||||
}
|
||||
|
||||
void Trim() override;
|
||||
|
||||
protected:
|
||||
static const uint32_t minAqlSize_ = 0x1000; // 4KB min
|
||||
static const uint32_t maxAqlSize_ = 0x20000; // 8MB max
|
||||
@@ -494,6 +486,10 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief Deregister scratch notification signals.
|
||||
void ClearScratchNotifiers() { scratch_notifiers_.clear(); }
|
||||
|
||||
// @brief Releases scratch back to the driver.
|
||||
// caller must hold scratch_lock_.
|
||||
void ReleaseScratch(void* base, size_t size, bool large);
|
||||
|
||||
// Bind index of peer device that is connected via xGMI links
|
||||
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
|
||||
|
||||
@@ -517,6 +513,8 @@ class GpuAgent : public GpuAgentInt {
|
||||
KernelMutex lock_;
|
||||
} gws_queue_;
|
||||
|
||||
ScratchCache scratch_cache_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(GpuAgent);
|
||||
};
|
||||
|
||||
|
||||
@@ -136,6 +136,8 @@ class MemoryRegion : public core::MemoryRegion {
|
||||
hsa_status_t AssignAgent(void* ptr, size_t size, const core::Agent& agent,
|
||||
hsa_access_permission_t access) const;
|
||||
|
||||
void Trim() const;
|
||||
|
||||
__forceinline bool IsLocalMemory() const {
|
||||
return ((mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) ||
|
||||
(mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC));
|
||||
|
||||
@@ -47,8 +47,9 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "core/inc/agent.h"
|
||||
#include "core/inc/hsa_internal.h"
|
||||
#include "core/inc/checked.h"
|
||||
#include "core/util/utils.h"
|
||||
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
@@ -106,6 +107,9 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
|
||||
virtual hsa_status_t AssignAgent(void* ptr, size_t size, const Agent& agent,
|
||||
hsa_access_permission_t access) const = 0;
|
||||
|
||||
// Releases any cached memory that may be held within the allocator.
|
||||
virtual void Trim() const {}
|
||||
|
||||
__forceinline bool fine_grain() const { return fine_grain_; }
|
||||
|
||||
__forceinline bool full_profile() const { return full_profile_; }
|
||||
|
||||
@@ -0,0 +1,191 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2020-2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
|
||||
#define HSA_RUNTIME_CORE_INC_SCRATCH_CACHE_H_
|
||||
|
||||
#include "core/inc/amd_gpu_agent.h"
|
||||
#include "core/util/locks.h"
|
||||
#include "core/util/utils.h"
|
||||
|
||||
#include <map>
|
||||
#include <functional>
|
||||
|
||||
namespace rocr {
|
||||
namespace AMD {
|
||||
|
||||
class ScratchCache {
|
||||
public:
|
||||
struct node {
|
||||
enum STATE { FREE = 0, ALLOC = 1, TRIM = 2, STEAL = 4 };
|
||||
void* base;
|
||||
bool large;
|
||||
uint32_t state;
|
||||
|
||||
node() : base(nullptr), state(FREE) {}
|
||||
|
||||
bool isFree() const { return state == FREE; }
|
||||
bool trimPending() const { return state == (ALLOC | TRIM); }
|
||||
|
||||
void trim() {
|
||||
assert(!isFree() && "Trim of free scratch node.");
|
||||
state |= TRIM;
|
||||
}
|
||||
void free() {
|
||||
assert(!isFree() && "Free of free scratch node.");
|
||||
state = FREE;
|
||||
}
|
||||
void alloc() {
|
||||
assert(isFree() && "Alloc of non-free scratch node.");
|
||||
state = ALLOC;
|
||||
}
|
||||
};
|
||||
|
||||
typedef ::std::multimap<size_t, node> map_t;
|
||||
typedef map_t::iterator ref_t;
|
||||
typedef ::std::function<void(void*, size_t, bool)> deallocator_t;
|
||||
|
||||
// @brief Contains scratch memory information.
|
||||
struct ScratchInfo {
|
||||
void* queue_base;
|
||||
// Size to fill the machine with size_per_thread
|
||||
size_t size;
|
||||
// Size to satisfy the present dispatch without throttling.
|
||||
size_t dispatch_size;
|
||||
size_t size_per_thread;
|
||||
uint32_t lanes_per_wave;
|
||||
ptrdiff_t queue_process_offset;
|
||||
bool large;
|
||||
bool retry;
|
||||
hsa_signal_t queue_retry;
|
||||
uint64_t wanted_slots;
|
||||
ScratchCache::ref_t scratch_node;
|
||||
};
|
||||
|
||||
ScratchCache(const ScratchCache& rhs) = delete;
|
||||
ScratchCache(ScratchCache&& rhs) = delete;
|
||||
ScratchCache& operator=(const ScratchCache& rhs) = delete;
|
||||
ScratchCache& operator=(ScratchCache&& rhs) = delete;
|
||||
|
||||
ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {}
|
||||
|
||||
~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); }
|
||||
|
||||
bool alloc(ScratchInfo& info) {
|
||||
ref_t it = map.upper_bound(info.size - 1);
|
||||
if (it == map.end()) return false;
|
||||
|
||||
// Small requests must have an exact size match and be small.
|
||||
if (!info.large) {
|
||||
while ((it != map.end()) && (it->first == info.size)) {
|
||||
if (it->second.isFree() && (!it->second.large)) {
|
||||
it->second.alloc();
|
||||
info.queue_base = it->second.base;
|
||||
info.scratch_node = it;
|
||||
return true;
|
||||
}
|
||||
it++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Large requests may use a small allocation and do not require an exact size match.
|
||||
while (it != map.end()) {
|
||||
if (it->second.isFree()) {
|
||||
it->second.alloc();
|
||||
info.queue_base = it->second.base;
|
||||
info.size = it->first;
|
||||
info.scratch_node = it;
|
||||
return true;
|
||||
}
|
||||
it++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void free(ScratchInfo& info) {
|
||||
assert(!info.scratch_node->second.isFree() && "free called on free scratch node.");
|
||||
auto it = info.scratch_node;
|
||||
if (it->second.trimPending()) {
|
||||
dealloc(it->second.base, it->first, it->second.large);
|
||||
map.erase(it);
|
||||
return;
|
||||
}
|
||||
it->second.free();
|
||||
}
|
||||
|
||||
bool trim(bool trim_nodes_in_use) {
|
||||
bool ret = !map.empty();
|
||||
auto it = map.begin();
|
||||
while (it != map.end()) {
|
||||
if (it->second.isFree()) {
|
||||
dealloc(it->second.base, it->first, it->second.large);
|
||||
auto temp = it;
|
||||
it++;
|
||||
map.erase(temp);
|
||||
} else {
|
||||
if (trim_nodes_in_use) it->second.trim();
|
||||
it++;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void insert(ScratchInfo& info) {
|
||||
node n;
|
||||
n.base = info.queue_base;
|
||||
n.large = info.large;
|
||||
n.alloc();
|
||||
|
||||
auto it = map.insert(std::make_pair(info.size, n));
|
||||
info.scratch_node = it;
|
||||
}
|
||||
|
||||
private:
|
||||
map_t map;
|
||||
deallocator_t dealloc;
|
||||
};
|
||||
|
||||
} // namespace AMD
|
||||
} // namespace rocr
|
||||
|
||||
#endif // header guard
|
||||
@@ -792,11 +792,25 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
|
||||
// Align whole waves to 1KB.
|
||||
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
|
||||
scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
|
||||
#ifndef NDEBUG
|
||||
scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) *
|
||||
pkt.dispatch.grid_size_z) / scratch.lanes_per_wave;
|
||||
|
||||
uint64_t lanes_per_group =
|
||||
(uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
|
||||
pkt.dispatch.workgroup_size_z;
|
||||
uint64_t waves_per_group =
|
||||
(lanes_per_group + scratch.lanes_per_wave - 1) / scratch.lanes_per_wave;
|
||||
uint64_t groups = ((uint64_t(pkt.dispatch.grid_size_x) + pkt.dispatch.workgroup_size_x - 1) /
|
||||
pkt.dispatch.workgroup_size_x) *
|
||||
((uint64_t(pkt.dispatch.grid_size_y) + pkt.dispatch.workgroup_size_y - 1) /
|
||||
pkt.dispatch.workgroup_size_y) *
|
||||
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
|
||||
pkt.dispatch.workgroup_size_z);
|
||||
|
||||
scratch.wanted_slots = groups * waves_per_group;
|
||||
scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
|
||||
#endif
|
||||
scratch.wanted_slots =
|
||||
Max(scratch.wanted_slots, uint64_t(queue->agent_->properties().NumShaderBanks));
|
||||
scratch.dispatch_size =
|
||||
scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;
|
||||
|
||||
queue->agent_->AcquireQueueScratch(scratch);
|
||||
|
||||
@@ -1117,7 +1131,7 @@ void AqlQueue::InitScratchSRD() {
|
||||
queue_scratch_.size_per_thread) + 1023) / 1024);
|
||||
tmpring_size.bits.WAVESIZE = wave_scratch;
|
||||
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
|
||||
uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
|
||||
uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024);
|
||||
tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
|
||||
amd_queue_.compute_tmpring_size = tmpring_size.u32All;
|
||||
return;
|
||||
|
||||
@@ -91,7 +91,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
|
||||
memory_bus_width_(0),
|
||||
memory_max_frequency_(0),
|
||||
ape1_base_(0),
|
||||
ape1_size_(0) {
|
||||
ape1_size_(0),
|
||||
scratch_cache_(
|
||||
[this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) {
|
||||
const bool is_apu_node = (properties_.NumCPUCores > 0);
|
||||
profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
|
||||
|
||||
@@ -171,6 +173,7 @@ GpuAgent::~GpuAgent() {
|
||||
_aligned_free(reinterpret_cast<void*>(ape1_base_));
|
||||
}
|
||||
|
||||
scratch_cache_.trim(true);
|
||||
if (scratch_pool_.base() != NULL) {
|
||||
hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size());
|
||||
}
|
||||
@@ -1007,7 +1010,8 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
|
||||
scratch.size_per_thread = private_segment_size;
|
||||
|
||||
const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
|
||||
scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
|
||||
scratch.size =
|
||||
scratch.size_per_thread * properties_.MaxSlotsScratchCU * scratch.lanes_per_wave * num_cu;
|
||||
scratch.queue_base = nullptr;
|
||||
scratch.queue_process_offset = 0;
|
||||
|
||||
@@ -1053,104 +1057,166 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
|
||||
uint64_t size_per_wave = AlignUp(scratch.size_per_thread * properties_.WaveFrontSize, 1024);
|
||||
if (size_per_wave > MAX_WAVE_SCRATCH) return;
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
// Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue.
|
||||
/*
|
||||
Determine size class needed.
|
||||
|
||||
Scratch allocations come in two flavors based on how it is retired. Small allocations may be
|
||||
kept bound to a queue and reused by firmware. This memory can not be reclaimed by the runtime
|
||||
on demand so must be kept small to avoid egregious OOM conditions. Other allocations, aka large,
|
||||
may be used by firmware only for one dispatch and are then surrendered to the runtime. This has
|
||||
significant latency so we don't want to make all scratch allocations large (ie single use).
|
||||
|
||||
Note that the designation "large" is for contrast with "small", which must really be small
|
||||
amounts of memory, and does not always imply a large quantity of memory is needed. Other
|
||||
properties of the allocation may require single use and so qualify the allocation or use as
|
||||
"large".
|
||||
|
||||
Here we decide on the boundaries for small scratch allocations. Both the largest small single
|
||||
allocation and the maximum amount of memory bound by small allocations are limited. Additionally
|
||||
some legacy devices do not support large scratch.
|
||||
|
||||
For small scratch we must allocate enough memory for every physical scratch slot.
|
||||
For large scratch compute the minimum memory needed to run the dispatch without limiting
|
||||
occupancy.
|
||||
Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single
|
||||
allocation.
|
||||
*/
|
||||
size_t small_limit = scratch_pool_.size() >> 3;
|
||||
// Lift limit for 2.10 release RCCL workaround.
|
||||
size_t single_limit = 146800640; //small_limit >> 2;
|
||||
bool use_reclaim = true;
|
||||
bool large = (scratch.size > single_limit) ||
|
||||
(scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit);
|
||||
large = (isa_->GetMajorVersion() < 8) ? false : large;
|
||||
large = core::Runtime::runtime_singleton_->flag().no_scratch_reclaim() ? false : large;
|
||||
if (large)
|
||||
scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
|
||||
else
|
||||
scratch.queue_base = scratch_pool_.alloc(scratch.size);
|
||||
large |= scratch.queue_base > scratch_pool_.high_split();
|
||||
scratch.large = large;
|
||||
if ((isa_->GetMajorVersion() < 8) ||
|
||||
core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) {
|
||||
large = false;
|
||||
use_reclaim = false;
|
||||
}
|
||||
|
||||
scratch.queue_process_offset =
|
||||
(need_queue_scratch_base)
|
||||
? uintptr_t(scratch.queue_base)
|
||||
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
|
||||
// If large is selected then the scratch will not be retained.
|
||||
// In that case allocate the minimum necessary for the dispatch since we don't need all slots.
|
||||
if (large) scratch.size = scratch.dispatch_size;
|
||||
|
||||
if (scratch.queue_base != nullptr) {
|
||||
if (profile_ == HSA_PROFILE_FULL) return;
|
||||
if (profile_ == HSA_PROFILE_BASE) {
|
||||
HSAuint64 alternate_va;
|
||||
if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
|
||||
HSAKMT_STATUS_SUCCESS) {
|
||||
if (large) scratch_used_large_ += scratch.size;
|
||||
return;
|
||||
// Ensure mapping will be in whole pages.
|
||||
scratch.size = AlignUp(scratch.size, 4096);
|
||||
|
||||
/*
|
||||
Sequence of attempts is:
|
||||
check cache
|
||||
attempt a new allocation
|
||||
trim unused blocks from cache
|
||||
attempt a new allocation
|
||||
check cache for sufficient used block, steal and wait (not implemented)
|
||||
trim used blocks from cache, evaluate retry
|
||||
reduce occupancy
|
||||
*/
|
||||
|
||||
// Lambda called in place.
|
||||
// Used to allow exit from nested loops.
|
||||
[&]() {
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
// Check scratch cache
|
||||
if (scratch_cache_.alloc(scratch)) return;
|
||||
|
||||
// Attempt new allocation.
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (large)
|
||||
scratch.queue_base = scratch_pool_.alloc_high(scratch.size);
|
||||
else
|
||||
scratch.queue_base = scratch_pool_.alloc(scratch.size);
|
||||
scratch.large = large | (scratch.queue_base > scratch_pool_.high_split());
|
||||
assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled.");
|
||||
|
||||
if (scratch.queue_base != nullptr) {
|
||||
if (profile_ == HSA_PROFILE_FULL) return;
|
||||
if (profile_ == HSA_PROFILE_BASE) {
|
||||
HSAuint64 alternate_va;
|
||||
if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
|
||||
HSAKMT_STATUS_SUCCESS) {
|
||||
if (scratch.large) scratch_used_large_ += scratch.size;
|
||||
scratch_cache_.insert(scratch);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Scratch request failed allocation or mapping.
|
||||
scratch_pool_.free(scratch.queue_base);
|
||||
scratch.queue_base = nullptr;
|
||||
|
||||
// Release cached scratch and retry.
|
||||
// First iteration trims unused blocks, second trims all.
|
||||
scratch_cache_.trim(i == 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Scratch request failed allocation or mapping.
|
||||
scratch_pool_.free(scratch.queue_base);
|
||||
scratch.queue_base = nullptr;
|
||||
|
||||
// Retry if large may yield needed space.
|
||||
if (scratch_used_large_ != 0) {
|
||||
if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Fail scratch allocation if reducing occupancy is disabled.
|
||||
if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return;
|
||||
|
||||
// Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
|
||||
debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
|
||||
scratch.size);
|
||||
const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
|
||||
const uint64_t total_waves = scratch.size / size_per_wave;
|
||||
uint64_t waves_per_cu = total_waves / num_cus;
|
||||
while (waves_per_cu != 0) {
|
||||
size_t size = waves_per_cu * num_cus * size_per_wave;
|
||||
void* base = scratch_pool_.alloc(size);
|
||||
HSAuint64 alternate_va;
|
||||
if ((base != nullptr) &&
|
||||
((profile_ == HSA_PROFILE_FULL) ||
|
||||
(hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
|
||||
// Scratch allocated and either full profile or map succeeded.
|
||||
scratch.queue_base = base;
|
||||
scratch.size = size;
|
||||
scratch.queue_process_offset = (need_queue_scratch_base)
|
||||
? uintptr_t(scratch.queue_base)
|
||||
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
|
||||
scratch.large = true;
|
||||
scratch_used_large_ += scratch.size;
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
|
||||
debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
|
||||
float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
|
||||
// Retry if large may yield needed space.
|
||||
if (scratch_used_large_ != 0) {
|
||||
if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
|
||||
return;
|
||||
}
|
||||
scratch_pool_.free(base);
|
||||
waves_per_cu--;
|
||||
}
|
||||
|
||||
// Failed to allocate minimal scratch
|
||||
assert(scratch.queue_base == nullptr && "bad scratch data");
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
|
||||
debug_print(" Could not allocate scratch for one wave per CU.\n");
|
||||
// Fail scratch allocation if reducing occupancy is disabled.
|
||||
if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
|
||||
return;
|
||||
|
||||
// Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
|
||||
debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
|
||||
scratch.size);
|
||||
const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
|
||||
const uint64_t total_waves = scratch.size / size_per_wave;
|
||||
uint64_t waves_per_cu = total_waves / num_cus;
|
||||
while (waves_per_cu != 0) {
|
||||
size_t size = waves_per_cu * num_cus * size_per_wave;
|
||||
void* base = scratch_pool_.alloc_high(size);
|
||||
HSAuint64 alternate_va;
|
||||
if ((base != nullptr) &&
|
||||
((profile_ == HSA_PROFILE_FULL) ||
|
||||
(hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) {
|
||||
// Scratch allocated and either full profile or map succeeded.
|
||||
scratch.queue_base = base;
|
||||
scratch.size = size;
|
||||
scratch.large = true;
|
||||
scratch_used_large_ += scratch.size;
|
||||
scratch_cache_.insert(scratch);
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
|
||||
debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
|
||||
float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
|
||||
return;
|
||||
}
|
||||
scratch_pool_.free(base);
|
||||
waves_per_cu--;
|
||||
}
|
||||
|
||||
// Failed to allocate minimal scratch
|
||||
assert(scratch.queue_base == nullptr && "bad scratch data");
|
||||
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
|
||||
debug_print(" Could not allocate scratch for one wave per CU.\n");
|
||||
return;
|
||||
}();
|
||||
|
||||
scratch.queue_process_offset = need_queue_scratch_base
|
||||
? uintptr_t(scratch.queue_base)
|
||||
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
|
||||
}
|
||||
|
||||
void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
|
||||
if (scratch.queue_base == nullptr) {
|
||||
return;
|
||||
}
|
||||
if (scratch.queue_base == nullptr) return;
|
||||
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
scratch_cache_.free(scratch);
|
||||
scratch.queue_base = nullptr;
|
||||
}
|
||||
|
||||
void GpuAgent::ReleaseScratch(void* base, size_t size, bool large) {
|
||||
if (profile_ == HSA_PROFILE_BASE) {
|
||||
if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) {
|
||||
if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) {
|
||||
assert(false && "Unmap scratch subrange failed!");
|
||||
}
|
||||
}
|
||||
scratch_pool_.free(scratch.queue_base);
|
||||
scratch.queue_base = nullptr;
|
||||
scratch_pool_.free(base);
|
||||
|
||||
if (scratch.large) scratch_used_large_ -= scratch.size;
|
||||
if (large) scratch_used_large_ -= size;
|
||||
|
||||
// Notify waiters that additional scratch may be available.
|
||||
for (auto notifier : scratch_notifiers_) {
|
||||
@@ -1444,5 +1510,11 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
|
||||
return GetXgmiBlit(dst_agent);
|
||||
}
|
||||
|
||||
void GpuAgent::Trim() {
|
||||
Agent::Trim();
|
||||
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
|
||||
scratch_cache_.trim(false);
|
||||
}
|
||||
|
||||
} // namespace amd
|
||||
} // namespace rocr
|
||||
|
||||
@@ -201,7 +201,7 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi
|
||||
// If it fails attempt to release memory from the block allocator and retry.
|
||||
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
|
||||
if (*address == nullptr) {
|
||||
fragment_allocator_.trim();
|
||||
owner()->Trim();
|
||||
*address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
|
||||
}
|
||||
|
||||
@@ -699,6 +699,8 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size,
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void MemoryRegion::Trim() const { fragment_allocator_.trim(); }
|
||||
|
||||
void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
|
||||
assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size.");
|
||||
|
||||
|
||||
Referens i nytt ärende
Block a user