SWDEV-528808 - Release all HW queues even if only one is idle (#240)
PyTorch may not explicitly idle each queue. As a result, some queues can be considered busy
even though they are actually idle.
[ROCm/clr commit: 65a0181a7c]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
ee91a1e94a
Коммит
3ea758a2d4
@@ -1326,6 +1326,7 @@ class VirtualDevice : public amd::HeapObject {
|
||||
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
|
||||
|
||||
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
|
||||
//! Hook for releasing the HW queues of all virtual devices. The default implementation is a no-op
virtual void ReleaseAllHwQueues() {}
|
||||
//! Hook for releasing the HW queue of this virtual device. The default implementation is a no-op
virtual void ReleaseHwQueue() {}
|
||||
|
||||
//! Get the blit manager object
|
||||
|
||||
@@ -567,6 +567,9 @@ class Device : public NullDevice {
|
||||
//! Removes a kernel from the kernel map
|
||||
void RemoveKernel(Kernel& gpuKernel) const;
|
||||
|
||||
// Returns the number of allocated normal queues on this device
|
||||
uint32_t NumNormalQueues() const { return num_normal_queues_.load(); }
|
||||
|
||||
private:
|
||||
bool create();
|
||||
|
||||
|
||||
@@ -1066,6 +1066,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
blocking = true;
|
||||
}
|
||||
|
||||
TrackQueueProgress(*packet, index);
|
||||
|
||||
AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
|
||||
*aql_loc = *packet;
|
||||
if (header != 0) {
|
||||
@@ -1241,6 +1243,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
barrier_packet_.completion_signal = signal;
|
||||
}
|
||||
|
||||
TrackQueueProgress(barrier_packet_, index);
|
||||
|
||||
// Reset fence_dirty_ flag if we submit a barrier with system scopes
|
||||
if (cache_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
@@ -1336,6 +1340,8 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
|
||||
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
|
||||
TrackQueueProgress(barrier_value_packet_, index);
|
||||
|
||||
while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
|
||||
hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast<hsa_amd_barrier_value_packet_t*>(
|
||||
gpu_queue_->base_address))[index & queueMask];
|
||||
@@ -1465,6 +1471,10 @@ VirtualGPU::~VirtualGPU() {
|
||||
delete blitMgr_;
|
||||
|
||||
if (tracking_created_) {
|
||||
amd::ScopedLock l(execution());
|
||||
if (gpu_queue_ == nullptr) {
|
||||
gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
|
||||
}
|
||||
// Release the resources of signal
|
||||
releaseGpuMemoryFence();
|
||||
}
|
||||
@@ -1673,6 +1683,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::ReleaseAllHwQueues() {
|
||||
if (roc_device_.settings().dynamic_queues_ &&
|
||||
(roc_device_.NumNormalQueues() > GPU_MAX_HW_QUEUES)) {
|
||||
// Lock the device to make the following thread safe
|
||||
amd::ScopedLock lock(roc_device_.vgpusAccess());
|
||||
for (uint idx = 0; idx < roc_device_.vgpus().size(); ++idx) {
|
||||
roc_device_.vgpus()[idx]->ReleaseHwQueue();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::ReleaseHwQueue() {
|
||||
// Try to release normal queue to the pool of active queues
|
||||
@@ -1680,8 +1702,12 @@ void VirtualGPU::ReleaseHwQueue() {
|
||||
(priority_ == amd::CommandQueue::Priority::Normal) &&
|
||||
!cooperative_ && (cuMask_.size() == 0)) {
|
||||
amd::ScopedLock lock(execution());
|
||||
if ((gpu_queue_ != nullptr) && roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) {
|
||||
gpu_queue_ = nullptr;
|
||||
if (gpu_queue_ != nullptr) {
|
||||
if (IsQueueIdle()) {
|
||||
if (roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) {
|
||||
gpu_queue_ = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,6 +390,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){}
|
||||
|
||||
virtual address allocKernelArguments(size_t size, size_t alignment) final;
|
||||
virtual void ReleaseAllHwQueues() final;
|
||||
virtual void ReleaseHwQueue() final;
|
||||
|
||||
/**
|
||||
@@ -529,6 +530,33 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Resets the current queue state. Note: should be called after AQL queue becomes idle
|
||||
void ResetQueueStates();
|
||||
|
||||
//! Track the progress of the queue based on the last write index and completion signal
|
||||
template <typename AqlPacket>
|
||||
inline void TrackQueueProgress(const AqlPacket& packet, uint64_t index) {
|
||||
// Track the progress of the current virtual queue
|
||||
last_write_index_ = index;
|
||||
// Update the last completion signal if the packet has one
|
||||
if (packet.completion_signal.handle != 0) {
|
||||
last_barrier_index_ = index;
|
||||
last_completion_signal_ = packet.completion_signal;
|
||||
}
|
||||
}
|
||||
|
||||
//! Returns true if the queue is considered as idle. That means all submitted packets are complete.
|
||||
//! Note: it doesn't track the state of caches
|
||||
bool IsQueueIdle() const {
|
||||
bool result = false;
|
||||
// Make sure the last packet contained a completion signal
|
||||
if (last_barrier_index_ == last_write_index_) {
|
||||
if ((last_write_index_ == 0) && (last_completion_signal_.handle == 0)) {
|
||||
result = true;
|
||||
} else {
|
||||
result = (hsa_signal_load_relaxed(last_completion_signal_) == 0);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
||||
|
||||
//! Queue state flags
|
||||
@@ -595,6 +623,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
bool fence_dirty_; //!< Fence modified flag
|
||||
|
||||
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
|
||||
uint64_t last_write_index_ = 0; //!< The last HW queue write index for any packet
|
||||
uint64_t last_barrier_index_ = 0; //!< The last HW queue write index for a packet
|
||||
//!< with a completion signal
|
||||
hsa_signal_t last_completion_signal_{}; //!< The last completion signal
|
||||
|
||||
using KernelArgImpl = device::Settings::KernelArgImpl;
|
||||
};
|
||||
|
||||
@@ -195,10 +195,13 @@ void HostQueue::finish(bool cpu_wait) {
|
||||
device_.removeFromActiveQueues(this);
|
||||
lastEnqueueCommand_->release();
|
||||
lastEnqueueCommand_ = nullptr;
|
||||
vdev()->ReleaseHwQueue(); // we can only release HwQueue when no commmand in quque.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Release all HW queues, which are idle or nearly idle
|
||||
vdev()->ReleaseAllHwQueues();
|
||||
|
||||
command->release();
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished for host queue : %p", this);
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user