diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 16a37f3c4c..b0e8e2e259 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -171,7 +171,8 @@ Device::Device(hsa_agent_t bkendDevice)
     , freeMem_(0)
     , vgpusAccess_("Virtual GPU List Ops Lock", true)
     , hsa_exclusive_gpu_access_(false)
+    , queuePool_(QueuePriority::Total)
     , numOfVgpus_(0) {
   group_segment_.handle = 0;
   system_segment_.handle = 0;
   system_coarse_segment_.handle = 0;
@@ -1503,8 +1504,11 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
   // Initialization of heap and other resources occur during the command
   // queue creation time.
   const std::vector<uint32_t> defaultCuMask = {};
+  const bool hasQueue = (queue != nullptr);
   VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative,
-                                             (queue != nullptr) ? queue->cuMask() : defaultCuMask);
+                                             hasQueue ? queue->cuMask() : defaultCuMask,
+                                             hasQueue ? queue->priority()
+                                                      : amd::CommandQueue::Priority::Normal);
 
   if (!virtualDevice->create()) {
     delete virtualDevice;
@@ -1935,17 +1939,43 @@ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
 }
 
 hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
-                                  const std::vector<uint32_t>& cuMask) {
-  assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
-          queuePool_.size(), GPU_MAX_HW_QUEUES);
+                                  const std::vector<uint32_t>& cuMask,
+                                  amd::CommandQueue::Priority priority) {
+  assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES &&
+         queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES &&
+         queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
 
-  // If we have reached the max number of queues, reuse an existing queue,
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues with low priority: %zu,"
+      " with normal priority: %zu, with high priority: %zu, maximum per priority is: %d",
+      queuePool_[QueuePriority::Low].size(),
+      queuePool_[QueuePriority::Normal].size(),
+      queuePool_[QueuePriority::High].size(), GPU_MAX_HW_QUEUES);
+
+  hsa_amd_queue_priority_t queue_priority;
+  uint qIndex;
+  switch (priority) {
+    case amd::CommandQueue::Priority::Low:
+      queue_priority = HSA_AMD_QUEUE_PRIORITY_LOW;
+      qIndex = QueuePriority::Low;
+      break;
+    case amd::CommandQueue::Priority::High:
+      queue_priority = HSA_AMD_QUEUE_PRIORITY_HIGH;
+      qIndex = QueuePriority::High;
+      break;
+    case amd::CommandQueue::Priority::Normal:
+    case amd::CommandQueue::Priority::Medium:
+    default:
+      queue_priority = HSA_AMD_QUEUE_PRIORITY_NORMAL;
+      qIndex = QueuePriority::Normal;
+      break;
+  }
+
+  // If we have reached the max number of queues, reuse an existing queue of the same priority,
   // choosing the one with the least number of users.
   // Note: Don't attempt to reuse the cooperative queue, since it's single per device
-  if (!coop_queue && (cuMask.size() == 0) && (queuePool_.size() == GPU_MAX_HW_QUEUES)) {
-    typedef decltype(queuePool_)::const_reference PoolRef;
-    auto lowest = std::min_element(queuePool_.begin(), queuePool_.end(),
+  if (!coop_queue && (cuMask.size() == 0) && (queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES)) {
+    typedef decltype(queuePool_)::value_type::const_reference PoolRef;
+    auto lowest = std::min_element(queuePool_[qIndex].begin(), queuePool_[qIndex].end(),
         [] (PoolRef A, PoolRef B) {
           return A.second.refCount < B.second.refCount;
         });
@@ -1982,8 +2012,18 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
       return nullptr;
     }
   }
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d, cooperative: %i",
-          queue, queue_size, coop_queue);
+
+  hsa_status_t st = HSA_STATUS_SUCCESS;
+  st = hsa_amd_queue_set_priority(queue, queue_priority);
+  if (st != HSA_STATUS_SUCCESS) {
+    DevLogError("Device::acquireQueue: hsa_amd_queue_set_priority failed!");
+    hsa_queue_destroy(queue);
+    return nullptr;
+  }
+
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d with priority %d,"
+      " cooperative: %i", queue, queue_size, queue_priority, coop_queue);
+
   hsa_amd_profiling_set_profiler_enabled(queue, 1);
   if (cuMask.size() != 0) {
     std::stringstream ss;
@@ -2008,7 +2048,7 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
     // per device.
     return queue;
   }
-  auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
+  auto result = queuePool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
   assert(result.second && "QueueInfo already exists");
   auto &qInfo = result.first->second;
   qInfo.refCount = 1;
@@ -2016,32 +2056,46 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
 }
 
 void Device::releaseQueue(hsa_queue_t* queue) {
-  auto qIter = queuePool_.find(queue);
-  if (qIter != queuePool_.end()) {
-    auto &qInfo = qIter->second;
-    assert(qInfo.refCount > 0);
-    qInfo.refCount--;
-    if (qInfo.refCount != 0) {
-      return;
-    }
-    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
+  for (auto& pool : queuePool_) {
+    auto qIter = pool.find(queue);
+    if (qIter != pool.end()) {
+      auto &qInfo = qIter->second;
+      assert(qInfo.refCount > 0);
+      qInfo.refCount--;
+      if (qInfo.refCount != 0) {
+        return;
+      }
+      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
+              "deleting hardware queue %p with refCount 0", queue);
 
-    if (qInfo.hostcallBuffer_) {
-      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
-              qInfo.hostcallBuffer_, queue);
-      disableHostcalls(qInfo.hostcallBuffer_, queue);
-      context().svmFree(qInfo.hostcallBuffer_);
-    }
+      if (qInfo.hostcallBuffer_) {
+        ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
+                "deleting hostcall buffer %p for hardware queue %p",
+                qInfo.hostcallBuffer_, queue);
+        disableHostcalls(qInfo.hostcallBuffer_, queue);
+        context().svmFree(qInfo.hostcallBuffer_);
+      }
 
-    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
-    queuePool_.erase(qIter);
+      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
+              "deleting hardware queue %p with refCount 0", queue);
+      pool.erase(qIter);
+      break;
+    }
   }
   hsa_queue_destroy(queue);
 }
 
 void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
-  auto qIter = queuePool_.find(queue);
-  assert(qIter != queuePool_.end());
+  QueueInfo* queueInfo = nullptr;
+  for (auto& pool : queuePool_) {
+    auto qIter = pool.find(queue);
+    if (qIter != pool.end()) {
+      queueInfo = &qIter->second;
+      break;
+    }
+  }
+
+  assert(queueInfo != nullptr && "hardware queue is not in any priority pool");
 
-  auto& qInfo = qIter->second;
+  auto& qInfo = *queueInfo;
   if (qInfo.hostcallBuffer_) {
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index ebd7b2c79d..75acf93813 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -444,7 +444,8 @@ class Device : public NullDevice {
   //! Acquire HSA queue. This method can create a new HSA queue or
   //! share previously created
   hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
-                            const std::vector<uint32_t>& cuMask = {});
+                            const std::vector<uint32_t>& cuMask = {},
+                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
 
   //! Release HSA queue
   void releaseQueue(hsa_queue_t*);
@@ -503,10 +504,16 @@ class Device : public NullDevice {
     int refCount;
     void* hostcallBuffer_;
   };
-  std::map<hsa_queue_t*, QueueInfo> queuePool_;  //!< Pool of HSA queues for recycling
+
+  //! Pools of HSA queues kept for recycling, one pool per QueuePriority (low, normal, high)
+  std::vector<std::map<hsa_queue_t*, QueueInfo>> queuePool_;
 
  public:
   amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
+
+  //! Indices of the per-priority queue pools; Total is the number of pools
+  enum QueuePriority : uint { Low = 0, Normal = 1, High = 2, Total = 3};
+
 };  // class roc::Device
 }  // namespace roc
 
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index e936468fd8..cc5dc4d3e2 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -591,7 +591,8 @@ bool VirtualGPU::releaseGpuMemoryFence() {
 }
 
 VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
-                       const std::vector<uint32_t>& cuMask)
+                       const std::vector<uint32_t>& cuMask,
+                       amd::CommandQueue::Priority priority)
     : device::VirtualDevice(device),
       state_(0),
       gpu_queue_(nullptr),
@@ -603,7 +604,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
       schedulerParam_(nullptr),
       schedulerQueue_(nullptr),
       schedulerSignal_({0}),
-      cuMask_(cuMask)
+      cuMask_(cuMask),
+      priority_(priority)
 {
   index_ = device.numOfVgpus_++;
   gpu_device_ = device.getBackendDevice();
@@ -705,7 +707,7 @@ VirtualGPU::~VirtualGPU() {
 bool VirtualGPU::create() {
   // Pick a reasonable queue size
   uint32_t queue_size = 1024;
-  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_);
+  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
   if (!gpu_queue_) return false;
 
   if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index e14a5101d2..e3a9cfd879 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "platform/commandqueue.hpp"
 #include "rocdevice.hpp"
 #include "utils/util.hpp"
 #include "hsa.h"
@@ -164,7 +165,8 @@ class VirtualGPU : public device::VirtualDevice {
   };
 
   VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
-             const std::vector<uint32_t>& cuMask = {});
+             const std::vector<uint32_t>& cuMask = {},
+             amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
   ~VirtualGPU();
 
   bool create();
@@ -364,7 +366,9 @@ class VirtualGPU : public device::VirtualDevice {
   uint16_t dispatchPacketHeaderNoSync_;
   uint16_t dispatchPacketHeader_;
 
-  const std::vector<uint32_t>& cuMask_; //!< The CU mask
+  //! Bit-vector representing the CU mask. Each active bit represents using one CU
+  const std::vector<uint32_t>& cuMask_;
+  amd::CommandQueue::Priority priority_; //!< The priority for the hsa queue
 };
 
 template <typename T>
diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp
index e051bdecd8..7def25bb73 100644
--- a/rocclr/platform/commandqueue.hpp
+++ b/rocclr/platform/commandqueue.hpp
@@ -47,7 +47,7 @@ class DeviceQueue;
 class CommandQueue : public RuntimeObject {
  public:
   static const uint RealTimeDisabled = 0xffffffff;
-  enum class Priority : uint { Normal = 0, Medium, High };
+  enum class Priority : uint { Low = 0, Normal, Medium, High };
 
   struct Properties {
     typedef cl_command_queue_properties value_type;