From 6df9ba97cea3efa4cec1fee634c45357e4b92fbb Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Wed, 28 Feb 2018 05:56:39 -0600 Subject: [PATCH] Sequence queue error callbacks with queue destroy. HSA v1.2 update. Change-Id: I13975e71b2c1ea5b7738236f5d02df84312ad00c --- runtime/hsa-runtime/core/inc/amd_aql_queue.h | 9 +- runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 2 +- .../core/runtime/amd_aql_queue.cpp | 166 +++++++++--------- runtime/hsa-runtime/core/runtime/runtime.cpp | 3 + 4 files changed, 90 insertions(+), 90 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aql_queue.h index 8126f9f996..04eaa3d904 100644 --- a/runtime/hsa-runtime/core/inc/amd_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -204,12 +204,13 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo void CloseRingBufferFD(const char* ring_buf_shm_path, int fd) const; int CreateRingBufferFD(const char* ring_buf_shm_path, uint32_t ring_buf_phys_size_bytes) const; - static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); - /// @brief Define the Scratch Buffer Descriptor and related parameters /// that enable kernel access scratch memory void InitScratchSRD(); + /// @brief Handler for hardware queue events. + static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); + // AQL packet ring buffer void* ring_buf_; @@ -246,6 +247,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo uint32_t pm4_ib_size_b_; KernelMutex pm4_ib_mutex_; + // Error handler control variable. 
+ std::atomic<uint32_t> dynamicScratchState; + enum { ERROR_HANDLER_DONE = 1, ERROR_HANDLER_TERMINATE = 2 }; + // Shared event used for queue errors static HsaEvent* queue_event_; diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 3e36be0f52..fd5750eace 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -77,7 +77,7 @@ class GpuAgentInt : public core::Agent { : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} // @brief Ensure blits are ready (performance hint). - virtual void PreloadBlits(){}; + virtual void PreloadBlits() {} // @brief Initialization hook invoked after tools library has loaded, // to allow tools interception of interface functions. diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 405af3df58..5ad9140db2 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -72,7 +72,7 @@ namespace amd { // Queue::amd_queue_ is cache-aligned for performance. const uint32_t kAmdQueueAlignBytes = 0x40; -HsaEvent* AqlQueue::queue_event_ = NULL; +HsaEvent* AqlQueue::queue_event_ = nullptr; std::atomic<uint32_t> AqlQueue::queue_count_(0); KernelMutex AqlQueue::queue_lock_; int AqlQueue::rtti_id_ = 0; @@ -92,7 +92,8 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr errors_data_(err_data), is_kv_queue_(is_kv), pm4_ib_buf_(nullptr), - pm4_ib_size_b_(0x1000) { + pm4_ib_size_b_(0x1000), + dynamicScratchState(0) { // When queue_full_workaround_ is set to 1, the ring buffer is internally // doubled in size. Virtual addresses in the upper half of the ring allocation // are mapped to the same set of pages backing the lower half. 
@@ -279,8 +280,18 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr } AqlQueue::~AqlQueue() { - Inactivate(); + // Remove error handler synchronously. + // Sequences error handler callbacks with queue destroy. + dynamicScratchState |= ERROR_HANDLER_TERMINATE; + HSA::hsa_signal_store_screlease(amd_queue_.queue_inactive_signal, 0x8000000000000000ull); + while ((dynamicScratchState & ERROR_HANDLER_DONE) != ERROR_HANDLER_DONE) { + HSA::hsa_signal_wait_relaxed(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, + 0x8000000000000000ull, -1ull, HSA_WAIT_STATE_BLOCKED); + HSA::hsa_signal_store_relaxed(amd_queue_.queue_inactive_signal, 0x8000000000000000ull); + } + auto err = hsaKmtDestroyQueue(queue_id_); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtDestroyQueue failed."); FreeRegisteredRingBuffer(); agent_->ReleaseQueueScratch(queue_scratch_.queue_base); HSA::hsa_signal_destroy(amd_queue_.queue_inactive_signal); @@ -289,7 +300,7 @@ AqlQueue::~AqlQueue() { queue_count_--; if (queue_count_ == 0) { core::InterruptSignal::DestroyEvent(queue_event_); - queue_event_ = NULL; + queue_event_ = nullptr; } } core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_); @@ -678,107 +689,88 @@ int AqlQueue::CreateRingBufferFD(const char* ring_buf_shm_path, hsa_status_t AqlQueue::Inactivate() { bool active = active_.exchange(false, std::memory_order_relaxed); if (active) { - auto err = hsaKmtDestroyQueue(this->queue_id_); - assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtDestroyQueue failed."); + auto err = hsaKmtUpdateQueue(queue_id_, 0, HSA_QUEUE_PRIORITY_NORMAL, NULL, 0, NULL); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtUpdateQueue failed."); } return HSA_STATUS_SUCCESS; } bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { AqlQueue* queue = (AqlQueue*)arg; + hsa_status_t errorCode = HSA_STATUS_SUCCESS; - if ((error_code & 1) == 1) { - // Insufficient scratch - recoverable - auto& scratch = 
queue->queue_scratch_; + // Process errors only if queue is not terminating. + if ((queue->dynamicScratchState & ERROR_HANDLER_TERMINATE) != ERROR_HANDLER_TERMINATE) { + // Process only one queue error, don't fall through. + if (error_code == 1) { + // Insufficient scratch - recoverable, don't process dynamic scratch if errors are present. + auto& scratch = queue->queue_scratch_; - queue->agent_->ReleaseQueueScratch(scratch.queue_base); + queue->agent_->ReleaseQueueScratch(scratch.queue_base); - uint64_t pkt_slot_idx = queue->amd_queue_.read_dispatch_id % queue->amd_queue_.hsa_queue.size; + uint64_t pkt_slot_idx = queue->amd_queue_.read_dispatch_id % queue->amd_queue_.hsa_queue.size; - const core::AqlPacket& pkt = - ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; + const core::AqlPacket& pkt = + ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; - uint32_t scratch_request = pkt.dispatch.private_segment_size; + uint32_t scratch_request = pkt.dispatch.private_segment_size; - scratch.size_per_thread = - Max(uint32_t(scratch.size_per_thread * 2), scratch_request); - // Align whole waves to 1KB. - scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); - scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * - queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize; + scratch.size_per_thread = Max(uint32_t(scratch.size_per_thread * 2), scratch_request); + // Align whole waves to 1KB. 
+ scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); + scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * + queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize; - queue->agent_->AcquireQueueScratch(scratch); - if (scratch.queue_base == NULL) { - // Out of scratch - promote error and invalidate queue - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_OUT_OF_RESOURCES, - queue->public_handle(), queue->errors_data_); - return false; + queue->agent_->AcquireQueueScratch(scratch); + + // Out of scratch - promote error + if (scratch.queue_base == NULL) errorCode = HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // Reset scratch memory related entities for the queue + queue->InitScratchSRD(); + + } else if ((error_code & 2) == 2) { // Invalid dim + errorCode = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + + } else if ((error_code & 4) == 4) { // Invalid group memory + errorCode = HSA_STATUS_ERROR_INVALID_ALLOCATION; + + } else if ((error_code & 8) == 8) { // Invalid (or NULL) code + errorCode = HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + + } else if (((error_code & 32) == 32) || // Invalid format: 32 is generic, + ((error_code & 256) == 256)) { // 256 is vendor specific packets + errorCode = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT; + + } else if ((error_code & 64) == 64) { // Group is too large + errorCode = HSA_STATUS_ERROR_INVALID_ARGUMENT; + + } else if ((error_code & 128) == 128) { // Out of VGPRs + errorCode = HSA_STATUS_ERROR_INVALID_ISA; + + } else if ((error_code & 0x80000000) == 0x80000000) { // Debug trap + errorCode = HSA_STATUS_ERROR_EXCEPTION; + + } else { // Undefined code + assert(false && "Undefined queue error code"); + errorCode = HSA_STATUS_ERROR; } - // Reset scratch memory related entities for the queue - queue->InitScratchSRD(); + if (errorCode == HSA_STATUS_SUCCESS) { + 
HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0); + return true; + } - } else if ((error_code & 2) == 2) { // Invalid dim queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, - queue->public_handle(), queue->errors_data_); - return false; - - } else if ((error_code & 4) == 4) { // Invalid group memory - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ALLOCATION, - queue->public_handle(), queue->errors_data_); - return false; - - } else if ((error_code & 8) == 8) { // Invalid (or NULL) code - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INVALID_CODE_OBJECT, - queue->public_handle(), queue->errors_data_); - return false; - - } else if (((error_code & 32) == 32) || - ((error_code & 256) == 256)) { // Invalid format: 32 is generic, - // 256 is vendor specific packets - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INVALID_PACKET_FORMAT, - queue->public_handle(), queue->errors_data_); - return false; - } else if ((error_code & 64) == 64) { // Group is too large - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ARGUMENT, - queue->public_handle(), queue->errors_data_); - return false; - } else if ((error_code & 128) == 128) { // Out of VGPRs - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ISA, - queue->public_handle(), queue->errors_data_); - return false; - } else if ((error_code & 0x80000000) == 0x80000000) { // Debug trap - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_EXCEPTION, - queue->public_handle(), queue->errors_data_); - return false; - } else { - // Undefined code - queue->Inactivate(); - assert(false && 
"Undefined queue error code"); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR, queue->public_handle(), - queue->errors_data_); - return false; + if (queue->errors_callback_ != nullptr) + queue->errors_callback_(errorCode, queue->public_handle(), queue->errors_data_); } - - HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0); - return true; + // Copy here is to protect against queue being released between setting the scratch state and + // updating the signal value. + hsa_signal_t signal = queue->amd_queue_.queue_inactive_signal; + queue->dynamicScratchState = ERROR_HANDLER_DONE; + HSA::hsa_signal_store_screlease(signal, -1ull); + return false; } hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count, diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index 835f39205d..0625c6f5ff 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -1222,6 +1222,9 @@ void Runtime::Unload() { amd::hsa::loader::Loader::Destroy(loader_); loader_ = nullptr; + std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject()); + gpu_agents_.clear(); + async_events_control_.Shutdown(); if (vm_fault_signal_ != nullptr) {