From 9baa65a8b71f4a0d99e7fdee4ff00a7c5ac66923 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Fri, 5 Dec 2025 10:58:01 -0600 Subject: [PATCH] [ROCR-Runtime] [ROCProfiler-SDK] Fixing the copy back to the original buffer malformed packets (#2185) * Fixing the copy back to the original buffer malformed packets * Addressing Copilot Comments * Addressing Review comments * Adjust staging buffer size allocation Change staging buffer size to match the number of packets. --- .../hsa-runtime/core/inc/intercept_queue.h | 3 +++ .../core/runtime/intercept_queue.cpp | 21 +++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h index b840ec4a73..95d7259b30 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/intercept_queue.h @@ -241,6 +241,9 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi // Proxy packet buffer SharedArray buffer_; + // Pre-allocated staging buffer for wrap-around cases + std::vector staging_buffer_; + // Packet transform callbacks std::vector, void*>> interceptors; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp index 2acdffa415..ebae2fce0e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp @@ -127,6 +127,9 @@ InterceptQueue::InterceptQueue(std::unique_ptr queue) buffer_ = SharedArray(wrapped->amd_queue_.hsa_queue.size); amd_queue_.hsa_queue.base_address = reinterpret_cast(&buffer_[0]); + // Pre-allocate staging buffer with a fixed capacity of 256 packets (not the queue size) + staging_buffer_.resize(256); + // Fill the ring buffer with invalid packet headers.
// Leave packet content uninitialized to help trigger application errors. for (uint32_t pkt_id = 0; pkt_id < wrapped->amd_queue_.hsa_queue.size; ++pkt_id) { @@ -398,8 +401,22 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) { Cursor.interceptor_index = interceptors.size() - 1; Cursor.pkt_index = next_packet_; auto& handler = interceptors[Cursor.interceptor_index]; - handler.first(&ring[next_packet_ & mask], packet_count, next_packet_, - handler.second, PacketWriter); + + // Check if packets wrap around the ring buffer boundary using unmasked indices. + // The interceptor callback expects packets to be contiguous in memory. + if ((next_packet_ + packet_count) > ((next_packet_ & ~mask) + amd_queue_.hsa_queue.size)) { + // Packets wrap around - use pre-allocated staging buffer + for (uint64_t j = 0; j < packet_count; ++j) { + staging_buffer_[j] = ring[(next_packet_ + j) & mask]; + } + handler.first(staging_buffer_.data(), packet_count, next_packet_, + handler.second, PacketWriter); + } else { + // Packets are contiguous in the ring buffer + handler.first(&ring[next_packet_ & mask], packet_count, next_packet_, + handler.second, PacketWriter); + } + if (IsDeviceMemRingBuf() && needsPcieOrdering()) { // Ensure the packet body is written as header may get reordered when writing over PCIE _mm_sfence();