From ed5b2ac1654ce2ccfc8194dee5d8566846bfd33b Mon Sep 17 00:00:00 2001
From: Benjamin Welton
Date: Tue, 9 Sep 2025 14:06:29 -0700
Subject: [PATCH] Fix deadlock in InterceptQueue::Submit when packet count exceeds queue capacity (#855)

InterceptQueue::Submit had an "all-or-nothing" packet submission policy
that could cause infinite retry loops when the number of packets to
submit exceeded the available queue slots.

When 504+ packets needed submission to a ~500-slot queue, the system
would:

1. Set submitted_count=0 (submit nothing)
2. Add retry barrier packet
3. Trigger async handler via StoreRelaxed
4. Attempt to submit overflow packets
5. Fail again due to same space constraints
6. Repeat

Solution: Added partial packet submission capability during overflow
processing while preserving the original "all-or-nothing" behavior for
normal operations. When processing overflow packets and insufficient
space exists for all packets, the system now submits as many packets as
possible rather than none.

The fix:

- Detects overflow processing via !overflow_.empty()
- Allows partial submission: submitted_count = free_slots - barrier_reservation
- Maintains atomicity guarantees for normal packet rewrites
- Prevents infinite retry loops by ensuring forward progress

This resolves deadlocks in high-throughput scenarios while maintaining
backward compatibility and the original design intent for packet rewrite
atomicity.
--- .../runtime/hsa-runtime/core/runtime/intercept_queue.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp index 4f4a44bf82..f707015b9c 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp @@ -240,7 +240,13 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { // defer packet insertion. Always make sure there is a free slot available // for the retry barrier packet if there is not already one present. else if (free_slots < submitted_count + (pending_retry_point ? 0 : 1)) { - submitted_count = 0; + // If we're in overflow processing (retry mechanism) and still can't fit all packets, + // submit as many as possible to make progress and avoid infinite retry loops + if (!overflow_.empty() && free_slots > (pending_retry_point ? 1 : 2)) { + submitted_count = free_slots - (pending_retry_point ? 0 : 1); + } else { + submitted_count = 0; + } } // If we are not submitting all the packets, we need to ensure there is a