Add support for setting queue priority for ROCm backend
Change-Id: I67ed5a6868af79538f7f4522d8d11c043cdf3c1e
This commit is contained in:
committad av
Aryan Salmanpour
förälder
44bc0cb35d
incheckning
b5552aa97f
@@ -171,7 +171,8 @@ Device::Device(hsa_agent_t bkendDevice)
|
||||
, freeMem_(0)
|
||||
, vgpusAccess_("Virtual GPU List Ops Lock", true)
|
||||
, hsa_exclusive_gpu_access_(false)
|
||||
, numOfVgpus_(0) {
|
||||
, numOfVgpus_(0)
|
||||
, queuePool_(QueuePriority::Total) {
|
||||
group_segment_.handle = 0;
|
||||
system_segment_.handle = 0;
|
||||
system_coarse_segment_.handle = 0;
|
||||
@@ -1503,8 +1504,11 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
|
||||
// Initialization of heap and other resources occur during the command
|
||||
// queue creation time.
|
||||
const std::vector<uint32_t> defaultCuMask = {};
|
||||
bool q = (queue != nullptr);
|
||||
VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative,
|
||||
(queue != nullptr) ? queue->cuMask() : defaultCuMask);
|
||||
q ? queue->cuMask() : defaultCuMask,
|
||||
q ? queue->priority()
|
||||
: amd::CommandQueue::Priority::Normal);
|
||||
|
||||
if (!virtualDevice->create()) {
|
||||
delete virtualDevice;
|
||||
@@ -1935,17 +1939,43 @@ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
|
||||
}
|
||||
|
||||
hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
|
||||
const std::vector<uint32_t>& cuMask) {
|
||||
assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
|
||||
queuePool_.size(), GPU_MAX_HW_QUEUES);
|
||||
const std::vector<uint32_t>& cuMask,
|
||||
amd::CommandQueue::Priority priority) {
|
||||
assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
|
||||
queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
|
||||
queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
|
||||
|
||||
// If we have reached the max number of queues, reuse an existing queue,
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues with low priority: %d,"
|
||||
" with normal priority: %d, with high priority: %d, maximum per priority is: %d",
|
||||
queuePool_[QueuePriority::Low].size(),
|
||||
queuePool_[QueuePriority::Normal].size(),
|
||||
queuePool_[QueuePriority::High].size(), GPU_MAX_HW_QUEUES);
|
||||
|
||||
hsa_amd_queue_priority_t queue_priority;
|
||||
uint qIndex;
|
||||
switch (priority) {
|
||||
case amd::CommandQueue::Priority::Low:
|
||||
queue_priority = HSA_AMD_QUEUE_PRIORITY_LOW;
|
||||
qIndex = QueuePriority::Low;
|
||||
break;
|
||||
case amd::CommandQueue::Priority::High:
|
||||
queue_priority = HSA_AMD_QUEUE_PRIORITY_HIGH;
|
||||
qIndex = QueuePriority::High;
|
||||
break;
|
||||
case amd::CommandQueue::Priority::Normal:
|
||||
case amd::CommandQueue::Priority::Medium:
|
||||
default:
|
||||
queue_priority = HSA_AMD_QUEUE_PRIORITY_NORMAL;
|
||||
qIndex = QueuePriority::Normal;
|
||||
break;
|
||||
}
|
||||
|
||||
// If we have reached the max number of queues, reuse an existing queue with the matching queue priority,
|
||||
// choosing the one with the least number of users.
|
||||
// Note: Don't attempt to reuse the cooperative queue, since it's single per device
|
||||
if (!coop_queue && (cuMask.size() == 0) && (queuePool_.size() == GPU_MAX_HW_QUEUES)) {
|
||||
typedef decltype(queuePool_)::const_reference PoolRef;
|
||||
auto lowest = std::min_element(queuePool_.begin(), queuePool_.end(),
|
||||
if (!coop_queue && (cuMask.size() == 0) && (queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES)) {
|
||||
typedef decltype(queuePool_)::value_type::const_reference PoolRef;
|
||||
auto lowest = std::min_element(queuePool_[qIndex].begin(), queuePool_[qIndex].end(),
|
||||
[] (PoolRef A, PoolRef B) {
|
||||
return A.second.refCount < B.second.refCount;
|
||||
});
|
||||
@@ -1982,8 +2012,18 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d, cooperative: %i",
|
||||
queue, queue_size, coop_queue);
|
||||
|
||||
hsa_status_t st = HSA_STATUS_SUCCESS;
|
||||
st = hsa_amd_queue_set_priority(queue, queue_priority);
|
||||
if (st != HSA_STATUS_SUCCESS) {
|
||||
DevLogError("Device::acquireQueue: hsa_amd_queue_set_priority failed!");
|
||||
hsa_queue_destroy(queue);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d with priority %d,"
|
||||
" cooperative: %i", queue, queue_size, queue_priority, coop_queue);
|
||||
|
||||
hsa_amd_profiling_set_profiler_enabled(queue, 1);
|
||||
if (cuMask.size() != 0) {
|
||||
std::stringstream ss;
|
||||
@@ -2008,7 +2048,7 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
|
||||
// per device.
|
||||
return queue;
|
||||
}
|
||||
auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
|
||||
auto result = queuePool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
|
||||
assert(result.second && "QueueInfo already exists");
|
||||
auto &qInfo = result.first->second;
|
||||
qInfo.refCount = 1;
|
||||
@@ -2016,32 +2056,45 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
|
||||
}
|
||||
|
||||
void Device::releaseQueue(hsa_queue_t* queue) {
|
||||
auto qIter = queuePool_.find(queue);
|
||||
if (qIter != queuePool_.end()) {
|
||||
auto &qInfo = qIter->second;
|
||||
assert(qInfo.refCount > 0);
|
||||
qInfo.refCount--;
|
||||
if (qInfo.refCount != 0) {
|
||||
for (auto& it : queuePool_) {
|
||||
auto qIter = it.find(queue);
|
||||
if (qIter != it.end()) {
|
||||
auto &qInfo = qIter->second;
|
||||
assert(qInfo.refCount > 0);
|
||||
qInfo.refCount--;
|
||||
if (qInfo.refCount != 0) {
|
||||
return;
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
|
||||
"deleting hardware queue %p with refCount 0", queue);
|
||||
|
||||
if (qInfo.hostcallBuffer_) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
|
||||
qInfo.hostcallBuffer_, queue);
|
||||
disableHostcalls(qInfo.hostcallBuffer_, queue);
|
||||
context().svmFree(qInfo.hostcallBuffer_);
|
||||
}
|
||||
if (qInfo.hostcallBuffer_) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
|
||||
"deleting hostcall buffer %p for hardware queue %p",
|
||||
qInfo.hostcallBuffer_, queue);
|
||||
disableHostcalls(qInfo.hostcallBuffer_, queue);
|
||||
context().svmFree(qInfo.hostcallBuffer_);
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
|
||||
queuePool_.erase(qIter);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
|
||||
"deleting hardware queue %p with refCount 0", queue);
|
||||
it.erase(qIter);
|
||||
break;
|
||||
}
|
||||
}
|
||||
hsa_queue_destroy(queue);
|
||||
}
|
||||
|
||||
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
|
||||
auto qIter = queuePool_.find(queue);
|
||||
assert(qIter != queuePool_.end());
|
||||
decltype(queuePool_)::value_type::iterator qIter;
|
||||
for (auto& it : queuePool_) {
|
||||
qIter = it.find(queue);
|
||||
if (qIter != it.end()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert(qIter != queuePool_[QueuePriority::High].end());
|
||||
|
||||
auto& qInfo = qIter->second;
|
||||
if (qInfo.hostcallBuffer_) {
|
||||
|
||||
@@ -444,7 +444,8 @@ class Device : public NullDevice {
|
||||
//! Acquire HSA queue. This method can create a new HSA queue or
|
||||
//! share previously created
|
||||
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
|
||||
const std::vector<uint32_t>& cuMask = {});
|
||||
const std::vector<uint32_t>& cuMask = {},
|
||||
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
|
||||
|
||||
//! Release HSA queue
|
||||
void releaseQueue(hsa_queue_t*);
|
||||
@@ -503,10 +504,16 @@ class Device : public NullDevice {
|
||||
int refCount;
|
||||
void* hostcallBuffer_;
|
||||
};
|
||||
std::map<hsa_queue_t*, QueueInfo> queuePool_; //!< Pool of HSA queues for recycling
|
||||
|
||||
//!< a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
|
||||
std::vector<std::map<hsa_queue_t*, QueueInfo>> queuePool_;
|
||||
|
||||
public:
|
||||
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
|
||||
//!< enum for keeping the total and available queue priorities
|
||||
enum QueuePriority : uint { Low = 0, Normal = 1, High = 2, Total = 3};
|
||||
|
||||
}; // class roc::Device
|
||||
} // namespace roc
|
||||
|
||||
|
||||
@@ -591,7 +591,8 @@ bool VirtualGPU::releaseGpuMemoryFence() {
|
||||
}
|
||||
|
||||
VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
const std::vector<uint32_t>& cuMask)
|
||||
const std::vector<uint32_t>& cuMask,
|
||||
amd::CommandQueue::Priority priority)
|
||||
: device::VirtualDevice(device),
|
||||
state_(0),
|
||||
gpu_queue_(nullptr),
|
||||
@@ -603,7 +604,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
schedulerParam_(nullptr),
|
||||
schedulerQueue_(nullptr),
|
||||
schedulerSignal_({0}),
|
||||
cuMask_(cuMask)
|
||||
cuMask_(cuMask),
|
||||
priority_(priority)
|
||||
{
|
||||
index_ = device.numOfVgpus_++;
|
||||
gpu_device_ = device.getBackendDevice();
|
||||
@@ -705,7 +707,7 @@ VirtualGPU::~VirtualGPU() {
|
||||
bool VirtualGPU::create() {
|
||||
// Pick a reasonable queue size
|
||||
uint32_t queue_size = 1024;
|
||||
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_);
|
||||
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
|
||||
if (!gpu_queue_) return false;
|
||||
|
||||
if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "platform/commandqueue.hpp"
|
||||
#include "rocdevice.hpp"
|
||||
#include "utils/util.hpp"
|
||||
#include "hsa.h"
|
||||
@@ -164,7 +165,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
};
|
||||
|
||||
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
|
||||
const std::vector<uint32_t>& cuMask = {});
|
||||
const std::vector<uint32_t>& cuMask = {},
|
||||
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
|
||||
~VirtualGPU();
|
||||
|
||||
bool create();
|
||||
@@ -364,7 +366,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint16_t dispatchPacketHeaderNoSync_;
|
||||
uint16_t dispatchPacketHeader_;
|
||||
|
||||
const std::vector<uint32_t>& cuMask_; //!< The CU mask
|
||||
//!< bit-vector representing the CU mask. Each active bit represents using one CU
|
||||
const std::vector<uint32_t>& cuMask_;
|
||||
amd::CommandQueue::Priority priority_; //!< The priority for the hsa queue
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -47,7 +47,7 @@ class DeviceQueue;
|
||||
class CommandQueue : public RuntimeObject {
|
||||
public:
|
||||
static const uint RealTimeDisabled = 0xffffffff;
|
||||
enum class Priority : uint { Normal = 0, Medium, High };
|
||||
enum class Priority : uint { Low = 0, Normal , Medium, High };
|
||||
|
||||
struct Properties {
|
||||
typedef cl_command_queue_properties value_type;
|
||||
|
||||
Referens i nytt ärende
Block a user