[ROCR-Runtime] [ROCProfiler-SDK] Fixing the copy back to the original buffer malformed packets (#2185)
* Fixing the copy back to the original buffer malformed packets * Addressing Copilot Comments * Addressing Review comments * Adjust staging buffer size allocation Change staging buffer size to match the number of packets.
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
3b875cc0ee
Коммит
9baa65a8b7
@@ -241,6 +241,9 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi
|
||||
// Proxy packet buffer
|
||||
SharedArray<AqlPacket, 4096> buffer_;
|
||||
|
||||
// Pre-allocated staging buffer for wrap-around cases
|
||||
std::vector<AqlPacket> staging_buffer_;
|
||||
|
||||
// Packet transform callbacks
|
||||
std::vector<std::pair<AMD::callback_t<hsa_amd_queue_intercept_handler>, void*>> interceptors;
|
||||
|
||||
|
||||
@@ -127,6 +127,9 @@ InterceptQueue::InterceptQueue(std::unique_ptr<Queue> queue)
|
||||
buffer_ = SharedArray<AqlPacket, 4096>(wrapped->amd_queue_.hsa_queue.size);
|
||||
amd_queue_.hsa_queue.base_address = reinterpret_cast<void*>(&buffer_[0]);
|
||||
|
||||
// Pre-allocate staging buffer with queue size
|
||||
staging_buffer_.resize(256);
|
||||
|
||||
// Fill the ring buffer with invalid packet headers.
|
||||
// Leave packet content uninitialized to help trigger application errors.
|
||||
for (uint32_t pkt_id = 0; pkt_id < wrapped->amd_queue_.hsa_queue.size; ++pkt_id) {
|
||||
@@ -398,8 +401,22 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
|
||||
Cursor.interceptor_index = interceptors.size() - 1;
|
||||
Cursor.pkt_index = next_packet_;
|
||||
auto& handler = interceptors[Cursor.interceptor_index];
|
||||
handler.first(&ring[next_packet_ & mask], packet_count, next_packet_,
|
||||
handler.second, PacketWriter);
|
||||
|
||||
// Check if packets wrap around the ring buffer boundary using unmasked indices.
|
||||
// The interceptor callback expects packets to be contiguous in memory.
|
||||
if ((next_packet_ + packet_count) > ((next_packet_ & ~mask) + amd_queue_.hsa_queue.size)) {
|
||||
// Packets wrap around - use pre-allocated staging buffer
|
||||
for (uint64_t j = 0; j < packet_count; ++j) {
|
||||
staging_buffer_[j] = ring[(next_packet_ + j) & mask];
|
||||
}
|
||||
handler.first(staging_buffer_.data(), packet_count, next_packet_,
|
||||
handler.second, PacketWriter);
|
||||
} else {
|
||||
// Packets are contiguous in the ring buffer
|
||||
handler.first(&ring[next_packet_ & mask], packet_count, next_packet_,
|
||||
handler.second, PacketWriter);
|
||||
}
|
||||
|
||||
if (IsDeviceMemRingBuf() && needsPcieOrdering()) {
|
||||
// Ensure the packet body is written as header may get reordered when writing over PCIE
|
||||
_mm_sfence();
|
||||
|
||||
Ссылка в новой задаче
Block a user