From c5faafeb25976c62359ee2ccc1653a447689cd39 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Thu, 10 Jul 2025 21:03:57 -0500 Subject: [PATCH] rocr: Ensure AqlQueue can exit on memory error A hang would occur on a memory error because the AqlQueue destructor would wait for a signal that never came. This change makes the exception handler signal completion on the memory-fault path as well, so the destructor can break out of the wait loop. [ROCm/ROCR-Runtime commit: c065d9a7e2141270d5933c9ca34c50b5f22d5253] --- .../hsa-runtime/core/runtime/amd_aql_queue.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index 52635b7c79..3e39bfa7db 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -1291,12 +1291,15 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { AqlQueue* queue = (AqlQueue*)arg; hsa_status_t errorCode = HSA_STATUS_ERROR; - - if (queue->exceptionState == ERROR_HANDLER_TERMINATE) { + auto exceptionHandlerDone = [&]() { Signal* signal = queue->exception_signal_; queue->exceptionState = ERROR_HANDLER_DONE; signal->StoreRelease(0); return false; + }; + + if (queue->exceptionState == ERROR_HANDLER_TERMINATE) { + return exceptionHandlerDone(); } for (auto& error : QueueErrors) { @@ -1313,7 +1316,7 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { // handler. if (errorCode == static_cast(HSA_STATUS_ERROR_MEMORY_FAULT)) { debug_print("Queue error - HSA_STATUS_ERROR_MEMORY_FAULT\n"); - return false; + return exceptionHandlerDone(); } // Fallback if KFD does not support GPU core dump. 
In this case, there core dump is @@ -1335,10 +1338,7 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { if (queue->errors_callback_ != nullptr) { queue->errors_callback_(errorCode, queue->public_handle(), queue->errors_data_); } - Signal* signal = queue->exception_signal_; - queue->exceptionState = ERROR_HANDLER_DONE; - signal->StoreRelease(0); - return false; + return exceptionHandlerDone(); } hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) {