diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index 523eb56385..45e0ac1dfa 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1326,6 +1326,7 @@ class VirtualDevice : public amd::HeapObject { virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); } virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; } + virtual void ReleaseAllHwQueues() {} virtual void ReleaseHwQueue() {} //! Get the blit manager object diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index fd366928ee..c1a7c3b920 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -567,6 +567,9 @@ class Device : public NullDevice { //! Removes a kernel from the kernel map void RemoveKernel(Kernel& gpuKernel) const; + // Returns the number of allocated normal queues on this device + uint32_t NumNormalQueues() const { return num_normal_queues_.load(); } + private: bool create(); diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index f51fd35e52..8d323c2e77 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -1066,6 +1066,8 @@ bool VirtualGPU::dispatchGenericAqlPacket( blocking = true; } + TrackQueueProgress(*packet, index); + AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; *aql_loc = *packet; if (header != 0) { @@ -1241,6 +1243,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, barrier_packet_.completion_signal = signal; } + TrackQueueProgress(barrier_packet_, index); + // Reset fence_dirty_ flag if we submit a barrier with system scopes if (cache_state == amd::Device::kCacheStateSystem) { fence_dirty_ = false; @@ -1336,6 +1340,8 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = 
hsa_queue_load_read_index_relaxed(gpu_queue_); + TrackQueueProgress(barrier_value_packet_, index); + while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast( gpu_queue_->base_address))[index & queueMask]; @@ -1465,6 +1471,10 @@ VirtualGPU::~VirtualGPU() { delete blitMgr_; if (tracking_created_) { + amd::ScopedLock l(execution()); + if (gpu_queue_ == nullptr) { + gpu_queue_ = roc_device_.AcquireActiveNormalQueue(); + } // Release the resources of signal releaseGpuMemoryFence(); } @@ -1673,6 +1683,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) { } } +// ================================================================================================ +void VirtualGPU::ReleaseAllHwQueues() { + if (roc_device_.settings().dynamic_queues_ && + (roc_device_.NumNormalQueues() > GPU_MAX_HW_QUEUES)) { + // Lock the device to make the following thread safe + amd::ScopedLock lock(roc_device_.vgpusAccess()); + for (uint idx = 0; idx < roc_device_.vgpus().size(); ++idx) { + roc_device_.vgpus()[idx]->ReleaseHwQueue(); + } + } +} + // ================================================================================================ void VirtualGPU::ReleaseHwQueue() { // Try to release normal queue to the pool of active queues @@ -1680,8 +1702,12 @@ void VirtualGPU::ReleaseHwQueue() { (priority_ == amd::CommandQueue::Priority::Normal) && !cooperative_ && (cuMask_.size() == 0)) { amd::ScopedLock lock(execution()); - if ((gpu_queue_ != nullptr) && roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) { - gpu_queue_ = nullptr; + if (gpu_queue_ != nullptr) { + if (IsQueueIdle()) { + if (roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) { + gpu_queue_ = nullptr; + } + } } } } diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index bc683ecff8..83f80824ba 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -390,6 
+390,7 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){} virtual address allocKernelArguments(size_t size, size_t alignment) final; + virtual void ReleaseAllHwQueues() final; virtual void ReleaseHwQueue() final; /** @@ -529,6 +530,33 @@ class VirtualGPU : public device::VirtualDevice { //! Resets the current queue state. Note: should be called after AQL queue becomes idle void ResetQueueStates(); + //! Track the progress of the queue based on the last write index and completion signal + template + inline void TrackQueueProgress(const AqlPacket& packet, uint64_t index) { + // Track the progress of the current virtual queue + last_write_index_ = index; + // Update the last completion signal if the packet has one + if (packet.completion_signal.handle != 0) { + last_barrier_index_ = index; + last_completion_signal_ = packet.completion_signal; + } + } + + //! Returns true if the queue is considered as idle. That means all submitted packets are complete. + //! Note: it doesn't track the state of caches + bool IsQueueIdle() const { + bool result = false; + // Make sure the last packet contained a completion signal + if (last_barrier_index_ == last_write_index_) { + if ((last_write_index_ == 0) && (last_completion_signal_.handle == 0)) { + result = true; + } else { + result = (hsa_signal_load_relaxed(last_completion_signal_) == 0); + } + } + return result; + } + std::vector pinnedMems_; //!< Pinned memory list //! 
Queue state flags @@ -595,6 +623,10 @@ class VirtualGPU : public device::VirtualDevice { bool fence_dirty_; //!< Fence modified flag std::atomic lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask + uint64_t last_write_index_ = 0; //!< The last HW queue write index for any packet + uint64_t last_barrier_index_ = 0; //!< The last HW queue write index for a packet + //!< with a completion signal + hsa_signal_t last_completion_signal_{}; //!< The last completion signal using KernelArgImpl = device::Settings::KernelArgImpl; }; diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 5382f2e1e7..fafc89f472 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -195,10 +195,13 @@ void HostQueue::finish(bool cpu_wait) { device_.removeFromActiveQueues(this); lastEnqueueCommand_->release(); lastEnqueueCommand_ = nullptr; - vdev()->ReleaseHwQueue(); // we can only release HwQueue when no commmand in quque. } } } + + // Release all HW queues, which are idle or nearly idle + vdev()->ReleaseAllHwQueues(); + command->release(); ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished for host queue : %p", this); }