clr: Update signal count and pool size for staging buffer (#2889)

* clr: Update signal count and pool size for staging buffer

* Rename variables and related identifiers

---------

Co-authored-by: Rahul Manocha <rmanocha@amd.com>
Cette révision appartient à :
Rahul Manocha
2026-01-29 10:34:00 -08:00
révisé par GitHub
Parent 58c203e252
révision c4f7593001
2 fichiers modifiés avec 13 ajouts et 9 suppressions
+6 -6
Voir le fichier
@@ -1730,8 +1730,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
schedulerThreads_(0), schedulerThreads_(0),
schedulerQueue_(nullptr), schedulerQueue_(nullptr),
barriers_(*this), barriers_(*this),
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_), managed_buffer_(*this, kStagingPoolNumSignals * device.settings().stagedXferSize_, kStagingPoolNumSignals),
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_), managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_, kKernArgPoolNumSignals),
cuMask_(cuMask), cuMask_(cuMask),
priority_(priority), priority_(priority),
copy_command_type_(0), copy_command_type_(0),
@@ -1912,7 +1912,7 @@ VirtualGPU::ManagedBuffer::~ManagedBuffer() {
// ================================================================================================ // ================================================================================================
bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) { bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
pool_chunk_end_ = pool_size_ / kPoolNumSignals; pool_chunk_end_ = pool_size_ / num_chunk_signals_;
active_chunk_ = 0; active_chunk_ = 0;
// Allocate memory for managed buffer // Allocate memory for managed buffer
if (mem_segment == Device::MemorySegment::kKernArg && if (mem_segment == Device::MemorySegment::kKernArg &&
@@ -1965,14 +1965,14 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
// Dispatch a barrier packet into the queue // Dispatch a barrier packet into the queue
gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]); gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]);
// Get the next chunk // Get the next chunk
active_chunk_ = ++active_chunk_ % kPoolNumSignals; active_chunk_ = ++active_chunk_ % num_chunk_signals_;
// Make sure the new active chunk is free // Make sure the new active chunk is free
bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait()); bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait());
assert(test && "Runtime can't fail a wait for chunk!"); assert(test && "Runtime can't fail a wait for chunk!");
// Make sure the current offset matches the new chunk to avoid possible overlaps // Make sure the current offset matches the new chunk to avoid possible overlaps
// between chunks and issues during recycle // between chunks and issues during recycle
pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_; pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_;
pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals; pool_chunk_end_ = pool_cur_offset_ + pool_size_ / num_chunk_signals_;
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment); result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
pool_cur_offset_ = (result + size) - pool_base_; pool_cur_offset_ = (result + size) - pool_base_;
} }
@@ -1983,7 +1983,7 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
// ================================================================================================ // ================================================================================================
void VirtualGPU::ManagedBuffer::ResetPool() { void VirtualGPU::ManagedBuffer::ResetPool() {
pool_cur_offset_ = 0; pool_cur_offset_ = 0;
pool_chunk_end_ = pool_size_ / kPoolNumSignals; pool_chunk_end_ = pool_size_ / num_chunk_signals_;
active_chunk_ = 0; active_chunk_ = 0;
} }
+7 -3
Voir le fichier
@@ -203,9 +203,9 @@ class VirtualGPU : public device::VirtualDevice {
class ManagedBuffer : public amd::EmbeddedObject { class ManagedBuffer : public amd::EmbeddedObject {
public: public:
//! The number of chunks the arg pool will be divided //! The number of chunks the arg pool will be divided
static constexpr uint32_t kPoolNumSignals = 16; ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size, uint32_t num_signals)
ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size) : gpu_(gpu), pool_size_(pool_size), pool_signal_(num_signals),
: gpu_(gpu), pool_size_(pool_size), pool_signal_(kPoolNumSignals) {} num_chunk_signals_(num_signals) {}
~ManagedBuffer(); ~ManagedBuffer();
//! Allocates all necessary resources to manage memory //! Allocates all necessary resources to manage memory
@@ -228,6 +228,7 @@ class VirtualGPU : public device::VirtualDevice {
uint32_t active_chunk_ = 0; //!< The index of the current active chunk uint32_t active_chunk_ = 0; //!< The index of the current active chunk
uint32_t pool_cur_offset_ = 0; //!< Current active offset for update uint32_t pool_cur_offset_ = 0; //!< Current active offset for update
std::vector<hsa_signal_t> pool_signal_; //!< Pool of HSA signals to manage multiple chunks std::vector<hsa_signal_t> pool_signal_; //!< Pool of HSA signals to manage multiple chunks
uint32_t num_chunk_signals_; //!< Number of chunks the pool is divided into; one HSA signal per chunk
}; };
class MemoryDependency : public amd::EmbeddedObject { class MemoryDependency : public amd::EmbeddedObject {
public: public:
@@ -622,6 +623,9 @@ class VirtualGPU : public device::VirtualDevice {
ManagedBuffer managed_buffer_; //!< Memory manager for staging copies ManagedBuffer managed_buffer_; //!< Memory manager for staging copies
ManagedBuffer managed_kernarg_buffer_; //!< Managed memory for kernel args ManagedBuffer managed_kernarg_buffer_; //!< Managed memory for kernel args
static constexpr uint32_t kStagingPoolNumSignals = 4; //!< HSA signal count (one per chunk) for the staging buffer
static constexpr uint32_t kKernArgPoolNumSignals = 16; //!< HSA signal count (one per chunk) for the kernarg buffer
friend class Timestamp; friend class Timestamp;
// PM4 packet for gfx8 performance counter // PM4 packet for gfx8 performance counter