From 7d84abbc3b37edc72b0274ff68bbcee46630b555 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Fri, 27 Sep 2024 15:51:28 +0000 Subject: [PATCH] rocr: find first dispatch pkt that needs scratch On GPUs where EOP is handled in asic, the read_dispatch_id is not always updated after each packet. Look for the first dispatch packet that needs scratch memory before allocating scratch. Change-Id: Ibf4b4b485f99bf2fabfe48e9609ca99111fdafbe [ROCm/ROCR-Runtime commit: d90fbee9c4673b3da2329ed09c4792488e7b3a40] --- .../runtime/hsa-runtime/core/inc/queue.h | 13 +++-- .../core/runtime/amd_aql_queue.cpp | 54 +++++++++++++------ 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h index 43bbe13e76..34bb1de7d7 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h @@ -90,18 +90,17 @@ struct AqlPacket { (type(header) != HSA_PACKET_TYPE_INVALID)); } - void __forceinline AssertIsDispatchAndNeedsScratch() const { + bool __forceinline IsDispatchAndNeedsScratch() const { assert(IsValid(packet.header) && "Invalid packet in dynamic scratch handler."); - assert(type(packet.header) == HSA_PACKET_TYPE_KERNEL_DISPATCH && - "Invalid packet in dynamic scratch handler."); + + if (type(packet.header) != HSA_PACKET_TYPE_KERNEL_DISPATCH || + dispatch.private_segment_size == 0) + return false; assert((dispatch.workgroup_size_x != 0) && (dispatch.workgroup_size_y != 0) && (dispatch.workgroup_size_z != 0) && "Invalid dispatch dimension."); - assert((dispatch.private_segment_size != 0) && - "Scratch memory request from packet with no scratch demand. Possible bad kernel code " - "object."); - return; + return true; } std::string string() const { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index f6a08c337d..a6843b92bb 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -960,9 +960,27 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code, * *******************************************************************************************/ - const auto& dispatch_id = amd_queue_.read_dispatch_id; - tool::notify_event_scratch_alloc_start(public_handle(), HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE, - dispatch_id); + core::AqlPacket *pkt = NULL; + uint64_t dispatch_id = UINT64_MAX; + + auto get_dispatch_pkt = [&]() { + dispatch_id = amd_queue_.read_dispatch_id; + do { + // On GPUs where EOP is handled in asic, the read_dispatch_id is not + // updated after each packet so look for the first dispatch that needs + // scratch + const uint64_t pkt_slot_idx = + dispatch_id & (amd_queue_.hsa_queue.size - 1); + + core::AqlPacket *dispatch_pkt = + &((core::AqlPacket *)amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; + if (dispatch_pkt->IsDispatchAndNeedsScratch()) return dispatch_pkt; + + dispatch_id++; + } while (dispatch_id <= LoadWriteIndexRelaxed()); + + return (core::AqlPacket *)NULL; + }; auto calc_dispatch_waves_per_group = [&](core::AqlPacket& pkt) { const uint64_t lanes_per_group = @@ -1022,15 +1040,16 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code, scratch.cooperative = (amd_queue_.hsa_queue.type == HSA_QUEUE_TYPE_COOPERATIVE); - uint64_t pkt_slot_idx = dispatch_id & (amd_queue_.hsa_queue.size - 1); + pkt = get_dispatch_pkt(); // Sets dispatch_id + assert((pkt && dispatch_id != UINT64_MAX) && + "Could not find dispatch packet with private_segment_size > 0"); - core::AqlPacket& pkt = ((core::AqlPacket*)amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; - - pkt.AssertIsDispatchAndNeedsScratch(); + tool::notify_event_scratch_alloc_start( + public_handle(), HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE, dispatch_id); uint32_t device_slots = calc_device_slots(); - uint32_t groups = calc_dispatch_groups(pkt); - uint32_t waves_per_group = calc_dispatch_waves_per_group(pkt); + uint32_t groups = calc_dispatch_groups(*pkt); + uint32_t waves_per_group = calc_dispatch_waves_per_group(*pkt); uint32_t dispatch_slots = groups * waves_per_group; dispatch_slots = std::min(dispatch_slots, device_slots); @@ -1038,7 +1057,8 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code, const uint64_t lanes_per_wave = (error_code & 0x400) ? 32 : 64; const uint64_t size_per_thread = - AlignUp(pkt.dispatch.private_segment_size, scratch.mem_alignment_size / lanes_per_wave); + AlignUp(pkt->dispatch.private_segment_size, + scratch.mem_alignment_size / lanes_per_wave); const uint64_t device_size = size_per_thread * lanes_per_wave * device_slots; const uint64_t dispatch_size = size_per_thread * lanes_per_wave * dispatch_slots; @@ -1054,9 +1074,9 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code, agent_->AcquireQueueAltScratch(scratch); if (scratch.alt_queue_base) { - scratch.alt_dispatch_limit_x = pkt.dispatch.grid_size_x; - scratch.alt_dispatch_limit_y = pkt.dispatch.grid_size_y; - scratch.alt_dispatch_limit_z = pkt.dispatch.grid_size_z; + scratch.alt_dispatch_limit_x = pkt->dispatch.grid_size_x; + scratch.alt_dispatch_limit_y = pkt->dispatch.grid_size_y; + scratch.alt_dispatch_limit_z = pkt->dispatch.grid_size_z; // Update queue SRD InitScratchSRD(); @@ -1099,9 +1119,11 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code, amd_queue_.queue_properties |= AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE; // Set system release fence to flush scratch stores with older firmware versions. if ((agent_->isa()->GetMajorVersion() == 8) && (agent_->GetMicrocodeVersion() < 729)) { - pkt.dispatch.header &= ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1) - << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - pkt.dispatch.header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + pkt->dispatch.header &= + ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1) + << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + pkt->dispatch.header |= + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); } } else if (scratch.alt_size && scratch.main_size > scratch.alt_size) { // Not using use-scratch-once, and dispatches that would fit in alt-scratch would also fit in