clr: Update signal count and pool size for staging buffer (#2889)

* clr: Update signal count and pool size for staging buffer

* Rename variables and constants for clarity

---------

Co-authored-by: Rahul Manocha <rmanocha@amd.com>
Tento commit je obsažen v:
Rahul Manocha
2026-01-29 10:34:00 -08:00
odevzdal GitHub
rodič 58c203e252
revize c4f7593001
2 změnil soubory, kde provedl 13 přidání a 9 odebrání
+6 -6
Zobrazit soubor
@@ -1730,8 +1730,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
schedulerThreads_(0),
schedulerQueue_(nullptr),
barriers_(*this),
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
managed_buffer_(*this, kStagingPoolNumSignals * device.settings().stagedXferSize_, kStagingPoolNumSignals),
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_, kKernArgPoolNumSignals),
cuMask_(cuMask),
priority_(priority),
copy_command_type_(0),
@@ -1912,7 +1912,7 @@ VirtualGPU::ManagedBuffer::~ManagedBuffer() {
// ================================================================================================
bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
pool_chunk_end_ = pool_size_ / kPoolNumSignals;
pool_chunk_end_ = pool_size_ / num_chunk_signals_;
active_chunk_ = 0;
// Allocate memory for managed buffer
if (mem_segment == Device::MemorySegment::kKernArg &&
@@ -1965,14 +1965,14 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
// Dispatch a barrier packet into the queue
gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]);
// Get the next chunk
active_chunk_ = ++active_chunk_ % kPoolNumSignals;
active_chunk_ = ++active_chunk_ % num_chunk_signals_;
// Make sure the new active chunk is free
bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait());
assert(test && "Runtime can't fail a wait for chunk!");
// Make sure the current offset matches the new chunk to avoid possible overlaps
// between chunks and issues during recycle
pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_;
pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals;
pool_chunk_end_ = pool_cur_offset_ + pool_size_ / num_chunk_signals_;
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
pool_cur_offset_ = (result + size) - pool_base_;
}
@@ -1983,7 +1983,7 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
// ================================================================================================
// Restores the pool bookkeeping to its initial state: the current offset returns to the start
// of the pool, the chunk boundary is set to the size of a single chunk (pool size divided by
// the chunk count), and chunk 0 becomes the active chunk again.
void VirtualGPU::ManagedBuffer::ResetPool() {
pool_cur_offset_ = 0;
pool_chunk_end_ = pool_size_ / kPoolNumSignals;
pool_chunk_end_ = pool_size_ / num_chunk_signals_;
active_chunk_ = 0;
}
+7 -3
Zobrazit soubor
@@ -203,9 +203,9 @@ class VirtualGPU : public device::VirtualDevice {
class ManagedBuffer : public amd::EmbeddedObject {
public:
//! The number of chunks the arg pool will be divided into
static constexpr uint32_t kPoolNumSignals = 16;
ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size)
: gpu_(gpu), pool_size_(pool_size), pool_signal_(kPoolNumSignals) {}
ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size, uint32_t num_signals)
: gpu_(gpu), pool_size_(pool_size), pool_signal_(num_signals),
num_chunk_signals_(num_signals) {}
~ManagedBuffer();
//! Allocates all necessary resources to manage memory
@@ -228,6 +228,7 @@ class VirtualGPU : public device::VirtualDevice {
uint32_t active_chunk_ = 0; //!< The index of the current active chunk
uint32_t pool_cur_offset_ = 0; //!< Current active offset for update
std::vector<hsa_signal_t> pool_signal_; //!< Pool of HSA signals to manage multiple chunks
uint32_t num_chunk_signals_; //!< Number of chunks the pool is divided into (one HSA signal per chunk)
};
class MemoryDependency : public amd::EmbeddedObject {
public:
@@ -622,6 +623,9 @@ class VirtualGPU : public device::VirtualDevice {
ManagedBuffer managed_buffer_; //!< Memory manager for staging copies
ManagedBuffer managed_kernarg_buffer_; //!< Managed memory for kernel args
static constexpr uint32_t kStagingPoolNumSignals = 4; //!< Number of chunks/HSA signals in the staging buffer pool
static constexpr uint32_t kKernArgPoolNumSignals = 16; //!< Number of chunks/HSA signals in the kernel arg buffer pool
friend class Timestamp;
// PM4 packet for gfx8 performance counter