SWDEV-517481 - Add dynamic queue management (#37)

Enabled by default. DEBUG_HIP_DYNAMIC_QUEUES controls the feature.
Cette révision appartient à :
Andryeyev, German
2025-03-19 11:22:50 -04:00
révisé par GitHub
Parent 392ed53c3c
révision 28967982b2
9 fichiers modifiés avec 84 ajouts et 9 suppressions
+1
Voir le fichier
@@ -1313,6 +1313,7 @@ class VirtualDevice : public amd::HeapObject {
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
virtual void ReleaseHwQueue() {}
//! Get the blit manager object
device::BlitManager& blitMgr() const { return *blitMgr_; }
+36 -2
Voir le fichier
@@ -2876,10 +2876,21 @@ hsa_queue_t* Device::getQueueFromPool(const uint qIndex) {
return nullptr;
}
// ================================================================================================
//! Acquires a normal priority HSA queue for dynamic queue management.
//! Forwards to acquireQueue() with ROC_AQL_QUEUE_SIZE as the size hint, cooperative mode off,
//! an empty CU mask and the managed flag set.
//! @return The queue handed back by acquireQueue()
hsa_queue_t* Device::AcquireActiveNormalQueue() {
  const uint32_t size_hint = ROC_AQL_QUEUE_SIZE;
  const std::vector<uint32_t> no_cu_mask{};
  return acquireQueue(size_hint, /*coop_queue=*/false, no_cu_mask,
                      amd::CommandQueue::Priority::Normal, /*managed=*/true);
}
// ================================================================================================
hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
const std::vector<uint32_t>& cuMask,
amd::CommandQueue::Priority priority) {
amd::CommandQueue::Priority priority,
bool managed) {
amd::ScopedLock l(active_queue_access_);
assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
@@ -2916,6 +2927,9 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
((queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES) || queuePool_[qIndex].size() > 0)) {
hsa_queue_t* queue = getQueueFromPool(qIndex);
if (queue != nullptr) {
if (!managed && (qIndex == QueuePriority::Normal)) {
num_normal_queues_++;
}
return queue;
}
}
@@ -3057,13 +3071,33 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
qInfo.refCount = 1;
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "acquireQueue refCount: %p (%d)",
result.first->first->base_address, result.first->second.refCount);
// Count only unmanaged queues without a CU mask that belong to the normal priority pool.
// The priority test has to be a comparison: an assignment would overwrite qIndex and make the
// branch depend on the enum's numeric value instead of the priority of the queue just created,
// while releaseQueue() decrements the counter for the normal pool only.
if (!managed && (cuMask.size() == 0) && (qIndex == QueuePriority::Normal)) {
  num_normal_queues_++;
}
return queue;
}
void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask, bool coop_queue) {
// ================================================================================================
//! Gives a managed normal queue back through releaseQueue(), but only while the counter of
//! allocated normal queues is above GPU_MAX_HW_QUEUES.
//! @param queue The HSA queue the caller is willing to give up
//! @return true if releaseQueue() was called for the queue, false if the queue was kept
bool Device::ReleaseActiveNormalQueue(hsa_queue_t* queue) {
  const bool over_limit = num_normal_queues_.load() > GPU_MAX_HW_QUEUES;
  if (!over_limit) {
    // Still within the limit, the caller keeps its queue
    return false;
  }
  // Empty CU mask, non-cooperative, managed release
  releaseQueue(queue, std::vector<uint32_t>{}, false, true);
  return true;
}
// ================================================================================================
void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask,
bool coop_queue, bool managed) {
amd::ScopedLock l(active_queue_access_);
for (auto& it : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
auto qIter = it.find(queue);
if (qIter != it.end()) {
if (!managed && (cuMask.size() == 0) && (&it == &queuePool_[QueuePriority::Normal])) {
num_normal_queues_--;
}
auto &qInfo = qIter->second;
assert(qInfo.refCount > 0);
qInfo.refCount--;
+9 -2
Voir le fichier
@@ -511,10 +511,15 @@ class Device : public NullDevice {
//! share previously created
hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
const std::vector<uint32_t>& cuMask = {},
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal,
bool managed = false);
//! Release HSA queue
void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {}, bool coop_queue = false);
void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {},
bool coop_queue = false, bool managed = false);
hsa_queue_t* AcquireActiveNormalQueue();
bool ReleaseActiveNormalQueue(hsa_queue_t* queue);
//! For the given HSA queue, return an existing hostcall buffer or create a
//! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
@@ -620,6 +625,8 @@ class Device : public NullDevice {
};
//! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queuePool_;
amd::Monitor active_queue_access_; //!< Lock to serialise virtual gpu list access
std::atomic<uint32_t> num_normal_queues_{0}; //!< The total number of allocated normal queues
//! returns a hsa queue from queuePool with least refCount and updates the refCount as well
hsa_queue_t* getQueueFromPool(const uint qIndex);
+2
Voir le fichier
@@ -95,6 +95,8 @@ Settings::Settings() {
kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
gwsInitSupported_ = true;
limit_blit_wg_ = 16;
dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
}
// ================================================================================================
+2 -1
Voir le fichier
@@ -50,7 +50,8 @@ class Settings : public device::Settings {
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment
uint barrier_value_packet_ : 1; //!< Barrier value packet functionality
uint reserved_ : 23;
uint dynamic_queues_ : 1; //!< Dynamic queues management
uint reserved_ : 22;
};
uint value_;
};
+27 -2
Voir le fichier
@@ -883,6 +883,15 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
return true;
}
// ================================================================================================
uint64_t VirtualGPU::getQueueID() {
amd::ScopedLock lock(execution());
if (gpu_queue_ == nullptr) {
gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
}
return gpu_queue_->id;
}
// ================================================================================================
static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
__atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
@@ -1463,7 +1472,7 @@ VirtualGPU::~VirtualGPU() {
roc_device_.vgpus()[idx]->index_--;
}
if (gpu_queue_) {
if (gpu_queue_ != nullptr) {
roc_device_.releaseQueue(gpu_queue_, cuMask_, cooperative_);
}
}
@@ -1522,6 +1531,8 @@ bool VirtualGPU::create() {
LogError("Could not create managed buffer for this queue!");
return false;
}
// Release HW queue until the first usage
ReleaseHwQueue();
return true;
}
@@ -1607,7 +1618,6 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
return result;
}
// ================================================================================================
void VirtualGPU::ManagedBuffer::ResetPool() {
pool_cur_offset_ = 0;
@@ -1631,12 +1641,27 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
}
}
// ================================================================================================
//! Offers the attached HW queue back to the device for dynamic queue management.
//! Does nothing unless the dynamic_queues_ setting is on and this virtual GPU has normal
//! priority. The local queue pointer is cleared only if the device reports the release.
void VirtualGPU::ReleaseHwQueue() {
  const bool eligible = roc_device_.settings().dynamic_queues_ &&
                        (priority_ == amd::CommandQueue::Priority::Normal);
  if (!eligible) {
    return;
  }

  amd::ScopedLock lock(execution());
  if (gpu_queue_ == nullptr) {
    // Nothing attached, nothing to give back
    return;
  }
  if (roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) {
    gpu_queue_ = nullptr;
  }
}
// ================================================================================================
/* profilingBegin, when profiling is enabled, creates a timestamp to save in
* virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
* and then calls start() to get the current host timestamp.
*/
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
if (gpu_queue_ == nullptr) {
gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
}
// Track the current command
command_ = &command;
+3 -2
Voir le fichier
@@ -376,6 +376,7 @@ class VirtualGPU : public device::VirtualDevice {
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){}
virtual address allocKernelArguments(size_t size, size_t alignment) final;
virtual void ReleaseHwQueue() final;
/**
* @brief Waits on an outstanding kernel without regard to how
@@ -436,7 +437,7 @@ class VirtualGPU : public device::VirtualDevice {
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
uint64_t getQueueID() { return gpu_queue_->id; }
uint64_t getQueueID();
//! Analyzes a crashed AQL queue to find a broken AQL packet
void AnalyzeAqlQueue() const;
@@ -531,7 +532,7 @@ class VirtualGPU : public device::VirtualDevice {
Timestamp* timestamp_;
amd::Command* command_; //!< Current command
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_queue_t* gpu_queue_; //!< Active queue associated with a vgpu
hsa_barrier_and_packet_t barrier_packet_;
hsa_amd_barrier_value_packet_t barrier_value_packet_;
+2
Voir le fichier
@@ -197,6 +197,8 @@ void HostQueue::finish(bool cpu_wait) {
}
}
}
// Release HW queue to the pool for dynamic management if enabled
vdev()->ReleaseHwQueue();
command->release();
ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished for host queue : %p", this);
+2
Voir le fichier
@@ -273,6 +273,8 @@ release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false, \
"Toggle kernel arg copy workaround") \
release(bool, DEBUG_CLR_SKIP_RELEASE_SCOPE, false, \
"Forces release scope to SCOPE_NONE for aql packets") \
release(bool, DEBUG_HIP_DYNAMIC_QUEUES, true, \
"Forces dynamic queue management") \
release(uint, DEBUG_HIP_7_PREVIEW, 0, \
"Enables specific backward incompatible changes support before 7.0," \
"using the mask. By default the changes are disabled and is set to 0")\