[ROCR-Runtime] [ROCProfiler-SDK] Fixing malformed packets in the copy back to the original buffer (#2185)

* Fixing malformed packets in the copy back to the original buffer

* Addressing Copilot Comments

* Addressing Review comments

* Adjust staging buffer size allocation

Change staging buffer size to match the number of packets.
Tento commit je obsažen v:
Ammar ELWazir
2025-12-05 10:58:01 -06:00
odevzdal GitHub
rodič 3b875cc0ee
revize 9baa65a8b7
Změněny 2 soubory: 22 přidání a 2 odebrání
@@ -241,6 +241,9 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi
// Proxy packet buffer
SharedArray<AqlPacket, 4096> buffer_;
// Pre-allocated staging buffer for wrap-around cases
std::vector<AqlPacket> staging_buffer_;
// Packet transform callbacks
std::vector<std::pair<AMD::callback_t<hsa_amd_queue_intercept_handler>, void*>> interceptors;
@@ -127,6 +127,9 @@ InterceptQueue::InterceptQueue(std::unique_ptr<Queue> queue)
buffer_ = SharedArray<AqlPacket, 4096>(wrapped->amd_queue_.hsa_queue.size);
amd_queue_.hsa_queue.base_address = reinterpret_cast<void*>(&buffer_[0]);
// Pre-allocate the staging buffer with one slot per ring entry (the same
// size used for buffer_) instead of a fixed count: the wrap-around path in
// StoreRelaxed copies packet_count packets into it without a bounds check,
// so a fixed-size buffer can overflow on larger queues.
staging_buffer_.resize(wrapped->amd_queue_.hsa_queue.size);
// Fill the ring buffer with invalid packet headers.
// Leave packet content uninitialized to help trigger application errors.
for (uint32_t pkt_id = 0; pkt_id < wrapped->amd_queue_.hsa_queue.size; ++pkt_id) {
@@ -398,8 +401,22 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
Cursor.interceptor_index = interceptors.size() - 1;
Cursor.pkt_index = next_packet_;
auto& handler = interceptors[Cursor.interceptor_index];
handler.first(&ring[next_packet_ & mask], packet_count, next_packet_,
handler.second, PacketWriter);
// Check if packets wrap around the ring buffer boundary using unmasked indices.
// The interceptor callback expects packets to be contiguous in memory.
if ((next_packet_ + packet_count) > ((next_packet_ & ~mask) + amd_queue_.hsa_queue.size)) {
// Packets wrap around - use pre-allocated staging buffer
for (uint64_t j = 0; j < packet_count; ++j) {
staging_buffer_[j] = ring[(next_packet_ + j) & mask];
}
handler.first(staging_buffer_.data(), packet_count, next_packet_,
handler.second, PacketWriter);
} else {
// Packets are contiguous in the ring buffer
handler.first(&ring[next_packet_ & mask], packet_count, next_packet_,
handler.second, PacketWriter);
}
if (IsDeviceMemRingBuf() && needsPcieOrdering()) {
// Ensure the packet body is written as header may get reordered when writing over PCIE
_mm_sfence();