SWDEV-517481 - Add dynamic queue management (#37)
Enabled by default. DEBUG_HIP_DYNAMIC_QUEUES controls the feature.
[ROCm/clr commit: 28967982b2]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
00e77db856
Коммит
5c7c86f66d
@@ -1313,6 +1313,7 @@ class VirtualDevice : public amd::HeapObject {
|
||||
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
|
||||
|
||||
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
|
||||
//! Hook that lets a backend hand its HW queue back; the default implementation does nothing
virtual void ReleaseHwQueue() {}
|
||||
|
||||
//! Get the blit manager object
|
||||
device::BlitManager& blitMgr() const { return *blitMgr_; }
|
||||
|
||||
@@ -2876,10 +2876,21 @@ hsa_queue_t* Device::getQueueFromPool(const uint qIndex) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Acquires a normal-priority HSA queue for dynamically managed use.
//!
//! Forwards to acquireQueue() with ROC_AQL_QUEUE_SIZE as the size hint, cooperative mode off,
//! an empty CU mask, Priority::Normal and managed == true.
//!
//! @return Whatever acquireQueue() returns for that request.
hsa_queue_t* Device::AcquireActiveNormalQueue() {
  const uint32_t size_hint = ROC_AQL_QUEUE_SIZE;
  const std::vector<uint32_t> no_cu_mask;
  return acquireQueue(size_hint, false, no_cu_mask, amd::CommandQueue::Priority::Normal, true);
}
|
||||
|
||||
// ================================================================================================
|
||||
hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
|
||||
const std::vector<uint32_t>& cuMask,
|
||||
amd::CommandQueue::Priority priority) {
|
||||
amd::CommandQueue::Priority priority,
|
||||
bool managed) {
|
||||
amd::ScopedLock l(active_queue_access_);
|
||||
|
||||
assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
|
||||
queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
|
||||
queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
|
||||
@@ -2916,6 +2927,9 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
|
||||
((queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES) || queuePool_[qIndex].size() > 0)) {
|
||||
hsa_queue_t* queue = getQueueFromPool(qIndex);
|
||||
if (queue != nullptr) {
|
||||
if (!managed && (qIndex == QueuePriority::Normal)) {
|
||||
num_normal_queues_++;
|
||||
}
|
||||
return queue;
|
||||
}
|
||||
}
|
||||
@@ -3057,13 +3071,33 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
|
||||
qInfo.refCount = 1;
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "acquireQueue refCount: %p (%d)",
|
||||
result.first->first->base_address, result.first->second.refCount);
|
||||
// Count only unmanaged, mask-less queues of normal priority. The priority must be compared
// with '==': a plain '=' would overwrite qIndex and make the test always true.
if (!managed && (cuMask.size() == 0) && (qIndex == QueuePriority::Normal)) {
|
||||
num_normal_queues_++;
|
||||
}
|
||||
return queue;
|
||||
}
|
||||
|
||||
void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask, bool coop_queue) {
|
||||
// ================================================================================================
|
||||
//! Hands a normal queue back to the device, but only when the number of allocated normal
//! queues (num_normal_queues_) exceeds GPU_MAX_HW_QUEUES.
//!
//! @param queue  HSA queue the caller is willing to give up.
//! @return true if releaseQueue() was invoked for \a queue, false if the caller keeps it.
bool Device::ReleaseActiveNormalQueue(hsa_queue_t* queue) {
  // Within the limit: the caller may keep its queue
  if (num_normal_queues_.load() <= GPU_MAX_HW_QUEUES) {
    return false;
  }
  // Release with an empty CU mask, non-cooperative, and the managed flag set
  const std::vector<uint32_t> no_cu_mask;
  releaseQueue(queue, no_cu_mask, false, true);
  return true;
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask,
|
||||
bool coop_queue, bool managed) {
|
||||
amd::ScopedLock l(active_queue_access_);
|
||||
for (auto& it : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
|
||||
auto qIter = it.find(queue);
|
||||
if (qIter != it.end()) {
|
||||
if (!managed && (cuMask.size() == 0) && (&it == &queuePool_[QueuePriority::Normal])) {
|
||||
num_normal_queues_--;
|
||||
}
|
||||
auto &qInfo = qIter->second;
|
||||
assert(qInfo.refCount > 0);
|
||||
qInfo.refCount--;
|
||||
|
||||
@@ -511,10 +511,15 @@ class Device : public NullDevice {
|
||||
//! share previously created
|
||||
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
|
||||
const std::vector<uint32_t>& cuMask = {},
|
||||
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
|
||||
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal,
|
||||
bool managed = false);
|
||||
|
||||
//! Release HSA queue
|
||||
void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {}, bool coop_queue = false);
|
||||
void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {},
|
||||
bool coop_queue = false, bool managed = false);
|
||||
|
||||
//! Acquire a normal-priority HSA queue through acquireQueue() with the managed flag set
hsa_queue_t* AcquireActiveNormalQueue();
|
||||
//! Release the queue if num_normal_queues_ exceeds GPU_MAX_HW_QUEUES; returns true when released
bool ReleaseActiveNormalQueue(hsa_queue_t* queue);
|
||||
|
||||
//! For the given HSA queue, return an existing hostcall buffer or create a
|
||||
//! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
|
||||
@@ -620,6 +625,8 @@ class Device : public NullDevice {
|
||||
};
|
||||
//! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
|
||||
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queuePool_;
|
||||
amd::Monitor active_queue_access_; //!< Lock to serialise virtual gpu list access
|
||||
std::atomic<uint32_t> num_normal_queues_{0}; //!< The total number of allocated normal queues
|
||||
|
||||
//! returns a hsa queue from queuePool with least refCount and updates the refCount as well
|
||||
hsa_queue_t* getQueueFromPool(const uint qIndex);
|
||||
|
||||
@@ -95,6 +95,8 @@ Settings::Settings() {
|
||||
kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
|
||||
gwsInitSupported_ = true;
|
||||
limit_blit_wg_ = 16;
|
||||
|
||||
dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
|
||||
@@ -50,7 +50,8 @@ class Settings : public device::Settings {
|
||||
uint system_scope_signal_ : 1; //!< HSA signal is visible to the entire system
|
||||
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
|
||||
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
|
||||
uint reserved_ : 23;
|
||||
uint dynamic_queues_ : 1; //!< Dynamic queues management
|
||||
uint reserved_ : 22;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -883,6 +883,15 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
uint64_t VirtualGPU::getQueueID() {
|
||||
amd::ScopedLock lock(execution());
|
||||
if (gpu_queue_ == nullptr) {
|
||||
gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
|
||||
}
|
||||
return gpu_queue_->id;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
|
||||
__atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
|
||||
@@ -1463,7 +1472,7 @@ VirtualGPU::~VirtualGPU() {
|
||||
roc_device_.vgpus()[idx]->index_--;
|
||||
}
|
||||
|
||||
if (gpu_queue_) {
|
||||
if (gpu_queue_ != nullptr) {
|
||||
roc_device_.releaseQueue(gpu_queue_, cuMask_, cooperative_);
|
||||
}
|
||||
}
|
||||
@@ -1522,6 +1531,8 @@ bool VirtualGPU::create() {
|
||||
LogError("Could not create managed buffer for this queue!");
|
||||
return false;
|
||||
}
|
||||
// Release HW queue until the first usage
|
||||
ReleaseHwQueue();
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1607,7 +1618,6 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::ManagedBuffer::ResetPool() {
|
||||
pool_cur_offset_ = 0;
|
||||
@@ -1631,12 +1641,27 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::ReleaseHwQueue() {
|
||||
// Try to release normal queue to the pool of active queues
|
||||
if (roc_device_.settings().dynamic_queues_ &&
|
||||
(priority_ == amd::CommandQueue::Priority::Normal)) {
|
||||
amd::ScopedLock lock(execution());
|
||||
if ((gpu_queue_ != nullptr) && roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) {
|
||||
gpu_queue_ = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
/* profilingBegin, when profiling is enabled, creates a timestamp to save in
|
||||
* virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
|
||||
* and then calls start() to get the current host timestamp.
|
||||
*/
|
||||
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
|
||||
if (gpu_queue_ == nullptr) {
|
||||
gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
|
||||
}
|
||||
// Track the current command
|
||||
command_ = &command;
|
||||
|
||||
|
||||
@@ -376,6 +376,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){}
|
||||
|
||||
virtual address allocKernelArguments(size_t size, size_t alignment) final;
|
||||
virtual void ReleaseHwQueue() final;
|
||||
|
||||
/**
|
||||
* @brief Waits on an outstanding kernel without regard to how
|
||||
@@ -436,7 +437,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
|
||||
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
|
||||
uint64_t getQueueID() { return gpu_queue_->id; }
|
||||
uint64_t getQueueID();
|
||||
|
||||
//! Analyzes a crashed AQL queue to find a broken AQL packet
|
||||
void AnalyzeAqlQueue() const;
|
||||
@@ -531,7 +532,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
Timestamp* timestamp_;
|
||||
amd::Command* command_; //!< Current command
|
||||
hsa_agent_t gpu_device_; //!< Physical device
|
||||
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
|
||||
hsa_queue_t* gpu_queue_; //!< Active queue associated with a vgpu
|
||||
hsa_barrier_and_packet_t barrier_packet_;
|
||||
hsa_amd_barrier_value_packet_t barrier_value_packet_;
|
||||
|
||||
|
||||
@@ -197,6 +197,8 @@ void HostQueue::finish(bool cpu_wait) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Release HW queue to the pool for dynamic management if enabled
|
||||
vdev()->ReleaseHwQueue();
|
||||
|
||||
command->release();
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished for host queue : %p", this);
|
||||
|
||||
@@ -273,6 +273,8 @@ release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false, \
|
||||
"Toggle kernel arg copy workaround") \
|
||||
release(bool, DEBUG_CLR_SKIP_RELEASE_SCOPE, false, \
|
||||
"Forces release scope to SCOPE_NONE for aql packets") \
|
||||
release(bool, DEBUG_HIP_DYNAMIC_QUEUES, true, \
|
||||
"Forces dynamic queue management") \
|
||||
release(uint, DEBUG_HIP_7_PREVIEW, 0, \
|
||||
"Enables specific backward incompatible changes support before 7.0," \
|
||||
"using the mask. By default the changes are disabled and is set to 0")\
|
||||
|
||||
Ссылка в новой задаче
Block a user