From 7d84abbc3b37edc72b0274ff68bbcee46630b555 Mon Sep 17 00:00:00 2001
From: David Yat Sin <David.YatSin@amd.com>
Date: Fri, 27 Sep 2024 15:51:28 +0000
Subject: [PATCH] rocr: find first dispatch pkt that needs scratch

On GPUs where EOP is handled in asic, the read_dispatch_id is not always
updated after each packet. Look for the first dispatch packet that needs
scratch memory before allocating scratch.

Change-Id: Ibf4b4b485f99bf2fabfe48e9609ca99111fdafbe


[ROCm/ROCR-Runtime commit: d90fbee9c4673b3da2329ed09c4792488e7b3a40]
---
 .../runtime/hsa-runtime/core/inc/queue.h      | 13 +++--
 .../core/runtime/amd_aql_queue.cpp            | 54 +++++++++++++------
 2 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h
index 43bbe13e76..34bb1de7d7 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h
@@ -90,18 +90,17 @@ struct AqlPacket {
             (type(header) != HSA_PACKET_TYPE_INVALID));
   }
 
-  void __forceinline AssertIsDispatchAndNeedsScratch() const {
+  bool __forceinline IsDispatchAndNeedsScratch() const {
     assert(IsValid(packet.header) && "Invalid packet in dynamic scratch handler.");
-    assert(type(packet.header) == HSA_PACKET_TYPE_KERNEL_DISPATCH &&
-           "Invalid packet in dynamic scratch handler.");
+
+    if (type(packet.header) != HSA_PACKET_TYPE_KERNEL_DISPATCH ||
+        dispatch.private_segment_size == 0)
+      return false;
 
     assert((dispatch.workgroup_size_x != 0) && (dispatch.workgroup_size_y != 0) &&
            (dispatch.workgroup_size_z != 0) && "Invalid dispatch dimension.");
 
-    assert((dispatch.private_segment_size != 0) &&
-           "Scratch memory request from packet with no scratch demand.  Possible bad kernel code "
-           "object.");
-    return;
+    return true;
   }
 
   std::string string() const {
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index f6a08c337d..a6843b92bb 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -960,9 +960,27 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
    *
    *******************************************************************************************/
 
-  const auto& dispatch_id = amd_queue_.read_dispatch_id;
-  tool::notify_event_scratch_alloc_start(public_handle(), HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE,
-                                         dispatch_id);
+  core::AqlPacket *pkt = NULL;
+  uint64_t dispatch_id = UINT64_MAX;
+
+  auto get_dispatch_pkt = [&]() {
+    dispatch_id = amd_queue_.read_dispatch_id;
+    do {
+      // On GPUs where EOP is handled in asic, the read_dispatch_id is not
+      // updated after each packet so look for the first dispatch that needs
+      // scratch
+      const uint64_t pkt_slot_idx =
+          dispatch_id & (amd_queue_.hsa_queue.size - 1);
+
+      core::AqlPacket *dispatch_pkt =
+          &((core::AqlPacket *)amd_queue_.hsa_queue.base_address)[pkt_slot_idx];
+      if (dispatch_pkt->IsDispatchAndNeedsScratch()) return dispatch_pkt;
+
+      dispatch_id++;
+    } while (dispatch_id <= LoadWriteIndexRelaxed());
+
+    return (core::AqlPacket *)NULL;
+  };
 
   auto calc_dispatch_waves_per_group = [&](core::AqlPacket& pkt) {
     const uint64_t lanes_per_group =
@@ -1022,15 +1040,16 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
 
   scratch.cooperative = (amd_queue_.hsa_queue.type == HSA_QUEUE_TYPE_COOPERATIVE);
 
-  uint64_t pkt_slot_idx = dispatch_id & (amd_queue_.hsa_queue.size - 1);
+  pkt = get_dispatch_pkt(); // Sets dispatch_id
+  assert((pkt && dispatch_id != UINT64_MAX) &&
+         "Could not find dispatch packet with private_segment_size > 0");
 
-  core::AqlPacket& pkt = ((core::AqlPacket*)amd_queue_.hsa_queue.base_address)[pkt_slot_idx];
-
-  pkt.AssertIsDispatchAndNeedsScratch();
+  tool::notify_event_scratch_alloc_start(
+      public_handle(), HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE, dispatch_id);
 
   uint32_t device_slots = calc_device_slots();
-  uint32_t groups = calc_dispatch_groups(pkt);
-  uint32_t waves_per_group = calc_dispatch_waves_per_group(pkt);
+  uint32_t groups = calc_dispatch_groups(*pkt);
+  uint32_t waves_per_group = calc_dispatch_waves_per_group(*pkt);
 
   uint32_t dispatch_slots = groups * waves_per_group;
   dispatch_slots = std::min(dispatch_slots, device_slots);
@@ -1038,7 +1057,8 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
   const uint64_t lanes_per_wave = (error_code & 0x400) ? 32 : 64;
 
   const uint64_t size_per_thread =
-      AlignUp(pkt.dispatch.private_segment_size, scratch.mem_alignment_size / lanes_per_wave);
+      AlignUp(pkt->dispatch.private_segment_size,
+              scratch.mem_alignment_size / lanes_per_wave);
   const uint64_t device_size = size_per_thread * lanes_per_wave * device_slots;
   const uint64_t dispatch_size = size_per_thread * lanes_per_wave * dispatch_slots;
 
@@ -1054,9 +1074,9 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
 
     agent_->AcquireQueueAltScratch(scratch);
     if (scratch.alt_queue_base) {
-      scratch.alt_dispatch_limit_x = pkt.dispatch.grid_size_x;
-      scratch.alt_dispatch_limit_y = pkt.dispatch.grid_size_y;
-      scratch.alt_dispatch_limit_z = pkt.dispatch.grid_size_z;
+      scratch.alt_dispatch_limit_x = pkt->dispatch.grid_size_x;
+      scratch.alt_dispatch_limit_y = pkt->dispatch.grid_size_y;
+      scratch.alt_dispatch_limit_z = pkt->dispatch.grid_size_z;
 
       // Update queue SRD
       InitScratchSRD();
@@ -1099,9 +1119,11 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
     amd_queue_.queue_properties |= AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE;
     // Set system release fence to flush scratch stores with older firmware versions.
     if ((agent_->isa()->GetMajorVersion() == 8) && (agent_->GetMicrocodeVersion() < 729)) {
-      pkt.dispatch.header &= ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1)
-                               << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
-      pkt.dispatch.header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+      pkt->dispatch.header &=
+          ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1)
+            << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+      pkt->dispatch.header |=
+          (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
     }
   } else if (scratch.alt_size && scratch.main_size > scratch.alt_size) {
     // Not using use-scratch-once, and dispatches that would fit in alt-scratch would also fit in