Dynamic HW queue management for normal-priority queues (rocclr, HIP only).

What the patch does
  * Settings::dynamic_queues_ is set from DEBUG_HIP_DYNAMIC_QUEUES when amd::IS_HIP.
  * Device counts non-managed normal-priority queue acquisitions in
    num_normal_queues_ and serialises acquireQueue()/releaseQueue() with
    active_queue_access_.
  * VirtualGPU::ReleaseHwQueue() (called at the end of VirtualGPU::create() and
    from HostQueue::finish()) returns the vgpu's queue to the device through
    Device::ReleaseActiveNormalQueue(), which only releases when
    num_normal_queues_ > GPU_MAX_HW_QUEUES. On release gpu_queue_ becomes nullptr.
  * getQueueID() and profilingBegin() re-acquire a queue through
    Device::AcquireActiveNormalQueue() when gpu_queue_ is nullptr.

Review fixes folded into this revision (no hunk line counts changed)
  * acquireQueue(): "(qIndex = QueuePriority::Normal)" assigned instead of
    comparing; now "==", matching the pooled path earlier in the function.
  * ReleaseHwQueue(): skipped for CU-masked and cooperative vgpus, because
    ReleaseActiveNormalQueue() releases with an empty CU mask and coop=false
    while ~VirtualGPU() releases with cuMask_/cooperative_.
  * active_queue_access_: comment now describes what the lock guards here.

NOTE(review): template arguments were lost when this patch was extracted; they
are restored below as std::vector<uint32_t>, std::atomic<uint32_t> and
std::vector<std::map<hsa_queue_t*, QueueInfo>> -- confirm against the tree.
NOTE(review): getQueueID() dereferences gpu_queue_ right after
AcquireActiveNormalQueue(); confirm acquisition cannot return nullptr there.
NOTE(review): ~VirtualGPU() only decrements num_normal_queues_ when it still
holds a queue; confirm the counter is meant to stay raised otherwise.
NOTE(review): blank context lines and indentation are reconstructed from the
hunk counts -- confirm before applying.

diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 09db38f4fd..ed04665560 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1313,6 +1313,7 @@ class VirtualDevice : public amd::HeapObject {
   virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
 
   virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
+  virtual void ReleaseHwQueue() {}  //!< Release the HW queue; no-op unless overridden
 
   //! Get the blit manager object
   device::BlitManager& blitMgr() const { return *blitMgr_; }
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 219cc3a46c..0ecb1e7c59 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -2876,10 +2876,21 @@ hsa_queue_t* Device::getQueueFromPool(const uint qIndex) {
   return nullptr;
 }
 
+// ================================================================================================
+hsa_queue_t* Device::AcquireActiveNormalQueue() {
+  uint32_t queue_size = ROC_AQL_QUEUE_SIZE;
+  auto queue = acquireQueue(
+      queue_size, false, std::vector<uint32_t>{}, amd::CommandQueue::Priority::Normal, true);
+  return queue;
+}
+
 // ================================================================================================
 hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
                                   const std::vector<uint32_t>& cuMask,
-                                  amd::CommandQueue::Priority priority) {
+                                  amd::CommandQueue::Priority priority,
+                                  bool managed) {
+  amd::ScopedLock l(active_queue_access_);
+
   assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
          queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
          queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
@@ -2916,6 +2927,9 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
       ((queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES) || queuePool_[qIndex].size() > 0)) {
     hsa_queue_t* queue = getQueueFromPool(qIndex);
     if (queue != nullptr) {
+      if (!managed && (qIndex == QueuePriority::Normal)) {
+        num_normal_queues_++;
+      }
       return queue;
     }
   }
@@ -3057,13 +3071,33 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
   qInfo.refCount = 1;
   ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "acquireQueue refCount: %p (%d)",
           result.first->first->base_address, result.first->second.refCount);
+  if (!managed && (cuMask.size() == 0) && (qIndex == QueuePriority::Normal)) {
+    num_normal_queues_++;
+  }
   return queue;
 }
 
-void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask, bool coop_queue) {
+// ================================================================================================
+bool Device::ReleaseActiveNormalQueue(hsa_queue_t* queue) {
+  // Release a queue if the total number of allocated queues exceeds the max possible
+  if (num_normal_queues_.load() > GPU_MAX_HW_QUEUES) {
+    releaseQueue(queue, std::vector<uint32_t>{}, false, true);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// ================================================================================================
+void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask,
+                          bool coop_queue, bool managed) {
+  amd::ScopedLock l(active_queue_access_);
   for (auto& it : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
     auto qIter = it.find(queue);
     if (qIter != it.end()) {
+      if (!managed && (cuMask.size() == 0) && (&it == &queuePool_[QueuePriority::Normal])) {
+        num_normal_queues_--;
+      }
       auto &qInfo = qIter->second;
       assert(qInfo.refCount > 0);
       qInfo.refCount--;
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index b1307486e8..8980d98530 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -511,10 +511,15 @@ class Device : public NullDevice {
   //! share previously created
   hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
                             const std::vector<uint32_t>& cuMask = {},
-                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
+                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal,
+                            bool managed = false);
 
   //! Release HSA queue
-  void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {}, bool coop_queue = false);
+  void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {},
+                    bool coop_queue = false, bool managed = false);
+
+  hsa_queue_t* AcquireActiveNormalQueue();  //!< Acquire a managed normal-priority queue
+  bool ReleaseActiveNormalQueue(hsa_queue_t* queue);  //!< True if the queue was released
 
   //! For the given HSA queue, return an existing hostcall buffer or create a
   //! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
@@ -620,6 +625,8 @@ class Device : public NullDevice {
   };
   //! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
   std::vector<std::map<hsa_queue_t*, QueueInfo>> queuePool_;
+  amd::Monitor active_queue_access_;  //!< Lock to serialise acquireQueue()/releaseQueue()
+  std::atomic<uint32_t> num_normal_queues_{0};  //!< The total number of allocated normal queues
 
   //! returns a hsa queue from queuePool with least refCount and updates the refCount as well
   hsa_queue_t* getQueueFromPool(const uint qIndex);
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index c68065f9f5..5ddf853dd3 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -95,6 +95,8 @@ Settings::Settings() {
   kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
   gwsInitSupported_ = true;
   limit_blit_wg_ = 16;
+
+  dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index b4da5df0d5..dd323b86ab 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -50,7 +50,8 @@ class Settings : public device::Settings {
       uint system_scope_signal_ : 1;    //!< HSA signal is visibile to the entire system
       uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
       uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint reserved_ : 23;
+      uint dynamic_queues_ : 1;         //!< Dynamic queues management
+      uint reserved_ : 22;
     };
     uint value_;
   };
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 7f43aaef35..6881654509 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -883,6 +883,15 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
   return true;
 }
 
+// ================================================================================================
+uint64_t VirtualGPU::getQueueID() {
+  amd::ScopedLock lock(execution());
+  if (gpu_queue_ == nullptr) {
+    gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
+  }
+  return gpu_queue_->id;
+}
+
 // ================================================================================================
 static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
   __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
@@ -1463,7 +1472,7 @@ VirtualGPU::~VirtualGPU() {
     roc_device_.vgpus()[idx]->index_--;
   }
 
-  if (gpu_queue_) {
+  if (gpu_queue_ != nullptr) {
     roc_device_.releaseQueue(gpu_queue_, cuMask_, cooperative_);
   }
 }
@@ -1522,6 +1531,8 @@ bool VirtualGPU::create() {
     LogError("Could not create managed buffer for this queue!");
     return false;
   }
+  // Release HW queue until the first usage
+  ReleaseHwQueue();
   return true;
 }
 
@@ -1607,7 +1618,6 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
   return result;
 }
 
-
 // ================================================================================================
 void VirtualGPU::ManagedBuffer::ResetPool() {
   pool_cur_offset_ = 0;
@@ -1631,12 +1641,27 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
   }
 }
 
+// ================================================================================================
+void VirtualGPU::ReleaseHwQueue() {
+  // Try to release normal queue to the pool of active queues (unmasked, non-cooperative only)
+  if (roc_device_.settings().dynamic_queues_ &&
+      (priority_ == amd::CommandQueue::Priority::Normal) && cuMask_.empty() && !cooperative_) {
+    amd::ScopedLock lock(execution());
+    if ((gpu_queue_ != nullptr) && roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) {
+      gpu_queue_ = nullptr;
+    }
+  }
+}
+
 // ================================================================================================
 /* profilingBegin, when profiling is enabled, creates a timestamp to save in
  * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
  * and then calls start() to get the current host timestamp.
  */
 void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+  if (gpu_queue_ == nullptr) {
+    gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
+  }
   // Track the current command
   command_ = &command;
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index c243f4cb71..43c927ff6e 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -376,6 +376,7 @@ class VirtualGPU : public device::VirtualDevice {
   virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){}
 
   virtual address allocKernelArguments(size_t size, size_t alignment) final;
+  virtual void ReleaseHwQueue() final;
 
   /**
    * @brief Waits on an outstanding kernel without regard to how
@@ -436,7 +437,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
-  uint64_t getQueueID() { return gpu_queue_->id; }
+  uint64_t getQueueID();
 
   //! Analyzes a crashed AQL queue to find a broken AQL packet
   void AnalyzeAqlQueue() const;
@@ -531,7 +532,7 @@ class VirtualGPU : public device::VirtualDevice {
   Timestamp* timestamp_;
   amd::Command* command_;  //!< Current command
   hsa_agent_t gpu_device_;  //!< Physical device
-  hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
+  hsa_queue_t* gpu_queue_;  //!< Active queue associated with a vgpu (nullptr while released)
   hsa_barrier_and_packet_t barrier_packet_;
   hsa_amd_barrier_value_packet_t barrier_value_packet_;
 
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index a6d7077358..77aa26eb7a 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -197,6 +197,8 @@ void HostQueue::finish(bool cpu_wait) {
       }
     }
   }
+  // Release HW queue to the pool for dynamic management if enabled
+  vdev()->ReleaseHwQueue();
   command->release();
 
   ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished for host queue : %p", this);
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 50edf21e11..24903a48fa 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -273,6 +273,8 @@ release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false, \
         "Toggle kernel arg copy workaround")                                  \
 release(bool, DEBUG_CLR_SKIP_RELEASE_SCOPE, false,                            \
         "Forces release scope to SCOPE_NONE for aql packets")                 \
+release(bool, DEBUG_HIP_DYNAMIC_QUEUES, true,                                 \
+        "Forces dynamic queue management")                                    \
 release(uint, DEBUG_HIP_7_PREVIEW, 0,                                         \
         "Enables specific backward incompatible changes support before 7.0,"  \
         "using the mask. By default the changes are disabled and is set to 0")\