diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 4f05df6355..d9b79fb212 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1730,8 +1730,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, schedulerThreads_(0), schedulerQueue_(nullptr), barriers_(*this), - managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_), - managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_), + managed_buffer_(*this, kStagingPoolNumSignals * device.settings().stagedXferSize_, kStagingPoolNumSignals), + managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_, kKernArgPoolNumSignals), cuMask_(cuMask), priority_(priority), copy_command_type_(0), @@ -1912,7 +1912,7 @@ VirtualGPU::ManagedBuffer::~ManagedBuffer() { // ================================================================================================ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) { - pool_chunk_end_ = pool_size_ / kPoolNumSignals; + pool_chunk_end_ = pool_size_ / num_chunk_signals_; active_chunk_ = 0; // Allocate memory for managed buffer if (mem_segment == Device::MemorySegment::kKernArg && @@ -1965,14 +1965,14 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) { // Dispatch a barrier packet into the queue gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]); // Get the next chunk - active_chunk_ = ++active_chunk_ % kPoolNumSignals; + active_chunk_ = ++active_chunk_ % num_chunk_signals_; // Make sure the new active chunk is free bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait()); assert(test && "Runtime can't fail a wait for chunk!"); // Make sure the current offset matches the new chunk to avoid possible overlaps // between chunks and issues during recycle pool_cur_offset_ = 
(active_chunk_ == 0) ? 0 : pool_chunk_end_; - pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals; + pool_chunk_end_ = pool_cur_offset_ + pool_size_ / num_chunk_signals_; result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment); pool_cur_offset_ = (result + size) - pool_base_; } @@ -1983,7 +1983,7 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) { // ================================================================================================ void VirtualGPU::ManagedBuffer::ResetPool() { pool_cur_offset_ = 0; - pool_chunk_end_ = pool_size_ / kPoolNumSignals; + pool_chunk_end_ = pool_size_ / num_chunk_signals_; active_chunk_ = 0; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index e9dee419b8..dd36d49ad6 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -203,9 +203,9 @@ class VirtualGPU : public device::VirtualDevice { class ManagedBuffer : public amd::EmbeddedObject { public: //! The number of chunks the arg pool will be divided - static constexpr uint32_t kPoolNumSignals = 16; - ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size) - : gpu_(gpu), pool_size_(pool_size), pool_signal_(kPoolNumSignals) {} + ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size, uint32_t num_signals) + : gpu_(gpu), pool_size_(pool_size), pool_signal_(num_signals), + num_chunk_signals_(num_signals) {} ~ManagedBuffer(); //! 
Allocates all necessary resources to manage memory @@ -228,6 +228,7 @@ class VirtualGPU : public device::VirtualDevice { uint32_t active_chunk_ = 0; //!< The index of the current active chunk uint32_t pool_cur_offset_ = 0; //!< Current active offset for update std::vector pool_signal_; //!< Pool of HSA signals to manage multiple chunks + uint32_t num_chunk_signals_; //!< Number of chunks the pool is divided into (one HSA signal per chunk) }; class MemoryDependency : public amd::EmbeddedObject { public: @@ -622,6 +623,9 @@ class VirtualGPU : public device::VirtualDevice { ManagedBuffer managed_buffer_; //!< Memory manager for staging copies ManagedBuffer managed_kernarg_buffer_; //!< Managed memory for kernel args + static constexpr uint32_t kStagingPoolNumSignals = 4; //!< HSA signal count for staging buffer + static constexpr uint32_t kKernArgPoolNumSignals = 16; //!< HSA signal count for kernarg buffer + friend class Timestamp; // PM4 packet for gfx8 performance counter