diff --git a/projects/clr/rocclr/device/rocm/roccounters.cpp b/projects/clr/rocclr/device/rocm/roccounters.cpp index c10ae9896e..4b7b6c4ef4 100644 --- a/projects/clr/rocclr/device/rocm/roccounters.cpp +++ b/projects/clr/rocclr/device/rocm/roccounters.cpp @@ -431,14 +431,6 @@ PerfCounter::PerfCounter(const Device& device, //!< A ROC device object // these block indices are valid for the SI (Gfx8) & Gfx9 devices switch (roc_device_.isa().versionMajor()) { - case (8): - gfxVersion_ = ROC_GFX8; - if (blockIndex < viBlockIdOrcaToRocr.size()) { - auto p = viBlockIdOrcaToRocr[blockIndex]; - event_.block_name = std::get<0>(p); - event_.block_index = std::get<1>(p); - } - break; case (9): gfxVersion_ = ROC_GFX9; if (blockIndex < gfx9BlockIdOrcaToRocr.size()) { diff --git a/projects/clr/rocclr/device/rocm/roccounters.hpp b/projects/clr/rocclr/device/rocm/roccounters.hpp index 494d3e07b6..9d7ca002a9 100644 --- a/projects/clr/rocclr/device/rocm/roccounters.hpp +++ b/projects/clr/rocclr/device/rocm/roccounters.hpp @@ -36,7 +36,6 @@ class PerfCounter : public device::PerfCounter { public: enum { ROC_UNSUPPORTED = 0, - ROC_GFX8, ROC_GFX9, ROC_GFX10 }; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 54c6da78d2..2e42a88778 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -832,13 +832,13 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1 // ================================================================================================ template bool VirtualGPU::dispatchGenericAqlPacket( - AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size) { + AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking) { const uint32_t queueSize = gpu_queue_->size; const uint32_t queueMask = queueSize - 1; const uint32_t sw_queue_size = queueMask; // Check for queue full and wait if needed. - uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size); + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); if (addSystemScope_) { header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE | @@ -887,43 +887,38 @@ bool VirtualGPU::dispatchGenericAqlPacket( blocking = true; } - // Insert packet(s) - // NOTE: need multiple packets to dispatch the performance counter - // packet blob of the legacy devices (gfx8) - for (uint i = 0; i < size; i++, index++, packet++) { - AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; - *aql_loc = *packet; - if (header != 0) { - packet_store_release(reinterpret_cast(aql_loc), header, rest); - } - ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, - "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = " - "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " - "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], private_seg_size=%zu, " - "group_seg_size=%zu, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx " - "rptr=%u, wptr=%u", - gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, - extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE), - extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER), - extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), - rest, reinterpret_cast(packet)->grid_size_x, - reinterpret_cast(packet)->grid_size_y, - reinterpret_cast(packet)->grid_size_z, - reinterpret_cast(packet)->workgroup_size_x, - reinterpret_cast(packet)->workgroup_size_y, - reinterpret_cast(packet)->workgroup_size_z, - reinterpret_cast(packet)->private_segment_size, - reinterpret_cast(packet)->group_segment_size, - reinterpret_cast(packet)->kernel_object, - reinterpret_cast(packet)->kernarg_address, - reinterpret_cast(packet)->completion_signal, read, - index); + AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; + *aql_loc = *packet; + if (header != 0) { + packet_store_release(reinterpret_cast(aql_loc), header, rest); } + ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, + "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = " + "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], private_seg_size=%zu, " + "group_seg_size=%zu, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx " + "rptr=%u, wptr=%u", + gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, + extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE), + extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + rest, reinterpret_cast(packet)->grid_size_x, + reinterpret_cast(packet)->grid_size_y, + reinterpret_cast(packet)->grid_size_z, + reinterpret_cast(packet)->workgroup_size_x, + reinterpret_cast(packet)->workgroup_size_y, + reinterpret_cast(packet)->workgroup_size_z, + reinterpret_cast(packet)->private_segment_size, + reinterpret_cast(packet)->group_segment_size, + reinterpret_cast(packet)->kernel_object, + reinterpret_cast(packet)->kernarg_address, + reinterpret_cast(packet)->completion_signal, read, + index); - hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1); + hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); // Mark the flag indicating if a dispatch is outstanding. // We are not waiting after every dispatch. @@ -984,9 +979,8 @@ inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCom profilingBegin(*vcmd, true); } dispatchBlockingWait(); - constexpr size_t kPacketSize = 1; auto packet = reinterpret_cast(aqlpacket); - dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); + dispatchGenericAqlPacket(packet, packet->header, packet->setup, false); if (vcmd != nullptr) { profilingEnd(*vcmd); } @@ -1003,13 +997,6 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, // In GFX8 the PM4 IB packet blob is writing directly to AQL queue // In GFX9 the PM4 IB is submitting by AQL Vendor Specific packet and switch (gfxVersion) { - case PerfCounter::ROC_GFX8: - { // Create legacy devices PM4 data - hsa_ext_amd_aql_pm4_packet_t pm4Packet[SLOT_PM4_SIZE_AQLP]; - extApi->hsa_ven_amd_aqlprofile_legacy_get_pm4(packet, static_cast(&pm4Packet[0])); - return dispatchGenericAqlPacket(&pm4Packet[0], 0, 0, blocking, SLOT_PM4_SIZE_AQLP); - } - break; case PerfCounter::ROC_GFX9: case PerfCounter::ROC_GFX10: { @@ -3468,9 +3455,8 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { uint8_t* aqlPacket = vcmd.getLastPacket(); if (aqlPacket != nullptr) { dispatchBlockingWait(); - constexpr size_t kPacketSize = 1; auto packet = reinterpret_cast(aqlPacket); - dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); + dispatchGenericAqlPacket(packet, packet->header, packet->setup, false); // We need to set fence_dirty_ flag as we would use a dispatch packet with a completion signal // to track graph finish for the last. The sync logic assumes HW event to a barrier packet that // has a system scope release. This would cause isFenceDirty() check at top level to insert diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index c5f8fa4df2..ea4e9943c9 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -433,8 +433,7 @@ class VirtualGPU : public device::VirtualDevice { bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, - uint16_t rest, bool blocking, - size_t size = 1); + uint16_t rest, bool blocking); void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false, hsa_signal_t signal = hsa_signal_t{0});