Add support for setting queue priority for ROCm backend

Change-Id: I67ed5a6868af79538f7f4522d8d11c043cdf3c1e
This commit is contained in:
Aryan Salmanpour
2020-06-04 01:16:31 -04:00
committad av Aryan Salmanpour
förälder 44bc0cb35d
incheckning b5552aa97f
5 ändrade filer med 105 tillägg och 39 borttagningar
+84 -31
Visa fil
@@ -171,7 +171,8 @@ Device::Device(hsa_agent_t bkendDevice)
, freeMem_(0)
, vgpusAccess_("Virtual GPU List Ops Lock", true)
, hsa_exclusive_gpu_access_(false)
, numOfVgpus_(0) {
, numOfVgpus_(0)
, queuePool_(QueuePriority::Total) {
group_segment_.handle = 0;
system_segment_.handle = 0;
system_coarse_segment_.handle = 0;
@@ -1503,8 +1504,11 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
// Initialization of heap and other resources occur during the command
// queue creation time.
const std::vector<uint32_t> defaultCuMask = {};
bool q = (queue != nullptr);
VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative,
(queue != nullptr) ? queue->cuMask() : defaultCuMask);
q ? queue->cuMask() : defaultCuMask,
q ? queue->priority()
: amd::CommandQueue::Priority::Normal);
if (!virtualDevice->create()) {
delete virtualDevice;
@@ -1935,17 +1939,43 @@ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
}
hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
const std::vector<uint32_t>& cuMask) {
assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
queuePool_.size(), GPU_MAX_HW_QUEUES);
const std::vector<uint32_t>& cuMask,
amd::CommandQueue::Priority priority) {
assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
// If we have reached the max number of queues, reuse an existing queue,
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues with low priority: %d,"
" with normal priority: %d, with high priority: %d, maximum per priority is: %d",
queuePool_[QueuePriority::Low].size(),
queuePool_[QueuePriority::Normal].size(),
queuePool_[QueuePriority::High].size(), GPU_MAX_HW_QUEUES);
hsa_amd_queue_priority_t queue_priority;
uint qIndex;
switch (priority) {
case amd::CommandQueue::Priority::Low:
queue_priority = HSA_AMD_QUEUE_PRIORITY_LOW;
qIndex = QueuePriority::Low;
break;
case amd::CommandQueue::Priority::High:
queue_priority = HSA_AMD_QUEUE_PRIORITY_HIGH;
qIndex = QueuePriority::High;
break;
case amd::CommandQueue::Priority::Normal:
case amd::CommandQueue::Priority::Medium:
default:
queue_priority = HSA_AMD_QUEUE_PRIORITY_NORMAL;
qIndex = QueuePriority::Normal;
break;
}
// If we have reached the max number of queues, reuse an existing queue with the matching queue priority,
// choosing the one with the least number of users.
// Note: Don't attempt to reuse the cooperative queue, since it's single per device
if (!coop_queue && (cuMask.size() == 0) && (queuePool_.size() == GPU_MAX_HW_QUEUES)) {
typedef decltype(queuePool_)::const_reference PoolRef;
auto lowest = std::min_element(queuePool_.begin(), queuePool_.end(),
if (!coop_queue && (cuMask.size() == 0) && (queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES)) {
typedef decltype(queuePool_)::value_type::const_reference PoolRef;
auto lowest = std::min_element(queuePool_[qIndex].begin(), queuePool_[qIndex].end(),
[] (PoolRef A, PoolRef B) {
return A.second.refCount < B.second.refCount;
});
@@ -1982,8 +2012,18 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
return nullptr;
}
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d, cooperative: %i",
queue, queue_size, coop_queue);
hsa_status_t st = HSA_STATUS_SUCCESS;
st = hsa_amd_queue_set_priority(queue, queue_priority);
if (st != HSA_STATUS_SUCCESS) {
DevLogError("Device::acquireQueue: hsa_amd_queue_set_priority failed!");
hsa_queue_destroy(queue);
return nullptr;
}
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d with priority %d,"
" cooperative: %i", queue, queue_size, queue_priority, coop_queue);
hsa_amd_profiling_set_profiler_enabled(queue, 1);
if (cuMask.size() != 0) {
std::stringstream ss;
@@ -2008,7 +2048,7 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
// per device.
return queue;
}
auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
auto result = queuePool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
assert(result.second && "QueueInfo already exists");
auto &qInfo = result.first->second;
qInfo.refCount = 1;
@@ -2016,32 +2056,45 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
}
//! Release one reference to an HSA queue previously handed out by acquireQueue().
//!
//! The queue is looked up in every per-priority pool. If it is pooled and still
//! has other users after the reference is dropped, nothing else happens.
//! Once the last reference is gone, the hostcall buffer attached to the queue
//! (if any) is disabled and freed, the pool entry is removed and the HSA queue
//! is destroyed. A queue that is not found in any pool is destroyed directly.
//!
//! \param queue HSA queue to release
void Device::releaseQueue(hsa_queue_t* queue) {
  for (auto& pool : queuePool_) {
    auto qIter = pool.find(queue);
    if (qIter == pool.end()) {
      continue;  // Not in this priority's pool, try the next one
    }

    auto& qInfo = qIter->second;
    assert(qInfo.refCount > 0);
    qInfo.refCount--;
    if (qInfo.refCount != 0) {
      // Still shared with other users: keep the hardware queue alive
      return;
    }
    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
            "deleting hardware queue %p with refCount 0", queue);

    if (qInfo.hostcallBuffer_) {
      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
              "deleting hostcall buffer %p for hardware queue %p",
              qInfo.hostcallBuffer_, queue);
      disableHostcalls(qInfo.hostcallBuffer_, queue);
      context().svmFree(qInfo.hostcallBuffer_);
    }

    // qInfo refers into the map entry, so it must not be used after the erase
    pool.erase(qIter);
    break;
  }
  hsa_queue_destroy(queue);
}
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
auto qIter = queuePool_.find(queue);
assert(qIter != queuePool_.end());
decltype(queuePool_)::value_type::iterator qIter;
for (auto& it : queuePool_) {
qIter = it.find(queue);
if (qIter != it.end()) {
break;
}
}
assert(qIter != queuePool_[QueuePriority::High].end());
auto& qInfo = qIter->second;
if (qInfo.hostcallBuffer_) {
+9 -2
Visa fil
@@ -444,7 +444,8 @@ class Device : public NullDevice {
//! Acquire HSA queue. This method can create a new HSA queue or
//! share previously created
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
const std::vector<uint32_t>& cuMask = {});
const std::vector<uint32_t>& cuMask = {},
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
//! Release HSA queue
void releaseQueue(hsa_queue_t*);
@@ -503,10 +504,16 @@ class Device : public NullDevice {
int refCount;
void* hostcallBuffer_;
};
std::map<hsa_queue_t*, QueueInfo> queuePool_; //!< Pool of HSA queues for recycling
//! Pools of HSA queues kept for recycling, one map per priority (indexed by QueuePriority)
std::vector<std::map<hsa_queue_t*, QueueInfo>> queuePool_;
public:
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
//! Queue priority levels (Low, Normal, High); Total is the number of levels
enum QueuePriority : uint { Low = 0, Normal = 1, High = 2, Total = 3};
}; // class roc::Device
} // namespace roc
+5 -3
Visa fil
@@ -591,7 +591,8 @@ bool VirtualGPU::releaseGpuMemoryFence() {
}
VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
const std::vector<uint32_t>& cuMask)
const std::vector<uint32_t>& cuMask,
amd::CommandQueue::Priority priority)
: device::VirtualDevice(device),
state_(0),
gpu_queue_(nullptr),
@@ -603,7 +604,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
schedulerParam_(nullptr),
schedulerQueue_(nullptr),
schedulerSignal_({0}),
cuMask_(cuMask)
cuMask_(cuMask),
priority_(priority)
{
index_ = device.numOfVgpus_++;
gpu_device_ = device.getBackendDevice();
@@ -705,7 +707,7 @@ VirtualGPU::~VirtualGPU() {
bool VirtualGPU::create() {
// Pick a reasonable queue size
uint32_t queue_size = 1024;
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_);
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
if (!gpu_queue_) return false;
if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
+6 -2
Visa fil
@@ -20,6 +20,7 @@
#pragma once
#include "platform/commandqueue.hpp"
#include "rocdevice.hpp"
#include "utils/util.hpp"
#include "hsa.h"
@@ -164,7 +165,8 @@ class VirtualGPU : public device::VirtualDevice {
};
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
const std::vector<uint32_t>& cuMask = {});
const std::vector<uint32_t>& cuMask = {},
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
~VirtualGPU();
bool create();
@@ -364,7 +366,9 @@ class VirtualGPU : public device::VirtualDevice {
uint16_t dispatchPacketHeaderNoSync_;
uint16_t dispatchPacketHeader_;
const std::vector<uint32_t>& cuMask_; //!< The CU mask
//! Bit-vector representing the CU mask; each set bit selects one CU for use
const std::vector<uint32_t>& cuMask_;
amd::CommandQueue::Priority priority_; //!< The priority for the hsa queue
};
template <typename T>
+1 -1
Visa fil
@@ -47,7 +47,7 @@ class DeviceQueue;
class CommandQueue : public RuntimeObject {
public:
static const uint RealTimeDisabled = 0xffffffff;
enum class Priority : uint { Normal = 0, Medium, High };
enum class Priority : uint { Low = 0, Normal , Medium, High };
struct Properties {
typedef cl_command_queue_properties value_type;