diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp
index 2c7d0e539d..8292218126 100644
--- a/projects/clr/rocclr/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/device/pal/paldevice.hpp
@@ -253,15 +253,16 @@ class Device : public NullDevice {
     uint32_t index_;                 //!< HW queue index for scratch buffer access
     amd::Monitor queue_lock_;        //!< Queue lock for access
     AqlPacketMgmt aql_packet_mgmt_;  //!< AQL packets management class for debugger support
-    QueueRecycleInfo()
+    explicit QueueRecycleInfo(const Device& dev)
         : counter_(1),
           engineType_(Pal::EngineTypeCompute),
           index_(0),
-          queue_lock_(true) /* Queue lock for sharing */ {}
+          queue_lock_(true) /* Queue lock for sharing */,
+          aql_packet_mgmt_(dev) /* Packet management is set up from the device */ {}
 
-    //! Returns the aql packet list
-    uintptr_t AqlPacketList() const {
-      return reinterpret_cast<uintptr_t>(&aql_packet_mgmt_.aql_packets_);
+    //! Returns the address of read_dispatch_id inside the MQD (amd_queue_) of this queue
+    uintptr_t DebuggerData() const {
+      return reinterpret_cast<uintptr_t>(&aql_packet_mgmt_.amd_queue_.read_dispatch_id);
     }
   };
 
diff --git a/projects/clr/rocclr/device/pal/palkernel.cpp b/projects/clr/rocclr/device/pal/palkernel.cpp
index 4d51bd2729..b597328c23 100644
--- a/projects/clr/rocclr/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/device/pal/palkernel.cpp
@@ -172,12 +172,10 @@ const pal::Program& Kernel::prog() const {
   return reinterpret_cast<const pal::Program&>(prog_);
 }
 
-hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
-                                                         const amd::NDRangeContainer& sizes,
-                                                         const_address params, size_t ldsAddress,
-                                                         uint64_t vmDefQueue,
-                                                         uint64_t* vmParentWrap,
-                                                         uint32_t* aql_index) const {
+std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
+Kernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
+                      const amd::NDRangeContainer& sizes, const_address params,
+                      size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
   // Provide private and local heap addresses
   static constexpr uint AddressShift = LP64_SWITCH(0, 32);
   const_address parameters = params;
@@ -364,7 +362,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::
            std::min(static_cast<uint32_t>(argsBufferSize()), signature.paramsSize()));
   }
 
-  hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index);
+  auto&& [hsaDisp, aql_packet_id] = gpu.GetAqlPacketSlot();
 
   constexpr uint16_t kDispatchPacketHeader =
       (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
@@ -401,7 +399,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::
     gpu.addVmMemory(gpu.hsaQueueMem());
   }
 
-  return hsaDisp;
+  return {hsaDisp, aql_packet_id};
 }
 
 bool Kernel::setKernelDescriptor(amd::hsa::loader::Symbol* sym,
diff --git a/projects/clr/rocclr/device/pal/palkernel.hpp b/projects/clr/rocclr/device/pal/palkernel.hpp
index ccbd9a2093..afd2e6b468 100644
--- a/projects/clr/rocclr/device/pal/palkernel.hpp
+++ b/projects/clr/rocclr/device/pal/palkernel.hpp
@@ -104,15 +104,14 @@ class Kernel : public device::Kernel {
 
   //! Returns AQL packet in CPU memory
   //! if the kernel arguments were successfully loaded, otherwise NULL
-  hsa_kernel_dispatch_packet_t* loadArguments(
-      VirtualGPU& gpu,                     //!< Running GPU context
-      const amd::Kernel& kernel,           //!< AMD kernel object
-      const amd::NDRangeContainer& sizes,  //!< NDrange container
-      const_address params,                //!< Application arguments for the kernel
-      size_t ldsAddress,                   //!< LDS address that includes all arguments.
-      uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
-      uint64_t* vmParentWrap,              //!< GPU VM parent aql wrap object
-      uint32_t* aql_index                  //!< AQL packet index in the packets array for debugger
+  std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
+  loadArguments(VirtualGPU& gpu,                     //!< Running GPU context
+                const amd::Kernel& kernel,           //!< AMD kernel object
+                const amd::NDRangeContainer& sizes,  //!< NDrange container
+                const_address params,                //!< Application arguments for the kernel
+                size_t ldsAddress,                   //!< LDS address that includes all arguments.
+                uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
+                uint64_t* vmParentWrap               //!< GPU VM parent aql wrap object
   ) const;
 
   //! Returns the kernel index in the program
diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp
index 33445a1ceb..587914e8fc 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.cpp
@@ -52,6 +52,37 @@
 
 namespace amd::pal {
 
+AqlPacketMgmt::AqlPacketMgmt(const Device& dev) {
+  using AmdQueue = decltype(amd_queue_);
+  static_assert(sizeof(AmdQueue::read_dispatch_id) == sizeof(uint64_t));
+  static_assert(sizeof(AmdQueue::write_dispatch_id) == sizeof(uint64_t));
+  const auto& shader_core = dev.properties().gfxipProperties.shaderCore;
+  const auto& gpu_memory = dev.properties().gpuMemoryProperties;
+  constexpr uint32_t kApertureHiShift = LP64_SWITCH(0, 32);
+
+  // No virtual GPU is recorded for any packet slot yet
+  memset(aql_vgpus_, 0, sizeof(aql_vgpus_));
+
+  // Describe the AQL packet list through the embedded HSA queue
+  auto& hsa_queue = amd_queue_.hsa_queue;
+  hsa_queue.type = HSA_QUEUE_TYPE_MULTI;
+  hsa_queue.features = HSA_QUEUE_FEATURE_KERNEL_DISPATCH;
+  hsa_queue.base_address = &aql_packets_[0];
+  hsa_queue.size = sizeof(aql_packets_) / sizeof(aql_packets_[0]);
+  static std::atomic<uint64_t> next_queue_id;  // One counter for all instances
+  hsa_queue.id = next_queue_id++;
+
+  amd_queue_.read_dispatch_id_field_base_byte_offset =
+      offsetof(AmdQueue, read_dispatch_id) - offsetof(AmdQueue, hsa_queue);
+  amd_queue_.max_cu_id = shader_core.numAvailableCus - 1;
+  amd_queue_.max_wave_id = shader_core.numSimdsPerCu * shader_core.numWavefrontsPerSimd - 1;
+  amd_queue_.private_segment_aperture_base_hi =
+      static_cast<uint32_t>(gpu_memory.privateApertureBase >> kApertureHiShift);
+  amd_queue_.group_segment_aperture_base_hi =
+      static_cast<uint32_t>(gpu_memory.sharedApertureBase >> kApertureHiShift);
+  AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, LP64_SWITCH(0, 1));
+}
+
 uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) {
   uint32_t allocedQueues = 0;
   for (const auto& queue : gpu.dev().QueuePool()) {
@@ -151,13 +182,13 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
       uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType);
       // Create PAL queue object
       if (index < GPU_MAX_HW_QUEUES) {
-        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
+        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo(gpu.dev());
         if (info == nullptr) {
           LogError("Could not create QueueRecycleInfo!");
           return nullptr;
         }
         addrQ = reinterpret_cast<address>(&info[1]);
-        qCreateInfo.aqlPacketList = info->AqlPacketList();
+        qCreateInfo.aqlPacketList = info->DebuggerData();
         result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
         if (result == Pal::Result::Success) {
           const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
@@ -193,7 +224,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
       queue->lock_ = &info->queue_lock_;
       addrQ = reinterpret_cast<address>(&queue[1]);
     } else {
-      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
+      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo(gpu.dev());
       if (info == nullptr) {
         LogError("Could not create QueueRecycleInfo!");
         return nullptr;
@@ -202,7 +233,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
       queue->aql_mgmt_ = &info->aql_packet_mgmt_;
       // Exclusive compute path
       addrQ = reinterpret_cast<address>(&queue[1]);
-      qCreateInfo.aqlPacketList = info->AqlPacketList();
+      qCreateInfo.aqlPacketList = info->DebuggerData();
       result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
     }
     if (result != Pal::Result::Success) {
@@ -1072,7 +1103,7 @@ VirtualGPU::~VirtualGPU() {
   if (queues_[MainEngine] != nullptr) {
     // Clear all timestamps, associated with this virtual GPU
     auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
-    for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
+    for (uint32_t i = 0; i < mgmt.amd_queue_.hsa_queue.size; ++i) {
       if (mgmt.aql_vgpus_[i] == this) {
         mgmt.aql_vgpus_[i] = nullptr;
         mgmt.aql_events_[i].invalidate();
@@ -2688,13 +2719,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
   GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
   uint32_t id = gpuEvent.id_;
   uint64_t vmParentWrap = 0;
-  uint32_t aql_index = 0;
   // Program the kernel arguments for the GPU execution
-  hsa_kernel_dispatch_packet_t* aqlPkt =
+  auto&& [aqlPkt, aql_packet_id] =
       hsaKernel.loadArguments(*this, kernel, sizes, parameters, ldsSize + sharedMemBytes,
-                              vmDefQueue, &vmParentWrap, &aql_index);
+                              vmDefQueue, &vmParentWrap);
   assert((nullptr != aqlPkt) && "Couldn't load kernel arguments");
 
+  auto& amd_queue = queues_[MainEngine]->aql_mgmt_->amd_queue_;
+  uint32_t aql_index = aql_packet_id % amd_queue.hsa_queue.size;
+
   // Dynamic call stack size is considered to calculate private segment size and scratch regs
   // in pal::Kernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
   // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
@@ -2729,9 +2762,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
   dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
   dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
   dispatchParam.aqlPacketIndex = aql_index;
+
+  // Update the mqd's information about scratch memory.
+  amd_queue.scratch_backing_memory_location = static_cast<uint64_t>(dispatchParam.scratchAddr);
+  amd_queue.scratch_backing_memory_byte_size = static_cast<uint64_t>(dispatchParam.scratchSize);
+
+  // FIXME: Conservatively, the read_dispatch_id cannot be smaller than the current aql_packet_id -
+  // hsa_queue.size for the debugger to work correctly. The read_dispatch_id really should be
+  // updated when the CmdBuf is marked as complete.
+  uint64_t new_read_dispatch_id = (aql_packet_id >= amd_queue.hsa_queue.size)
+      ? (aql_packet_id - amd_queue.hsa_queue.size + 1)
+      : 0;
+
+  // Do an atomic max of &amd_queue.read_dispatch_id and new_read_dispatch_id
+  uint64_t old_read_dispatch_id = amd_queue.read_dispatch_id;
+  while (new_read_dispatch_id > old_read_dispatch_id) {
+#if defined(__GNUC__)
+    if (__atomic_compare_exchange_n(&amd_queue.read_dispatch_id, &old_read_dispatch_id,
+                                    new_read_dispatch_id, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
+      break;
+#elif defined(_MSC_VER)
+    uint64_t initial_value = InterlockedCompareExchange64(
+        reinterpret_cast<LONG64 volatile*>(&amd_queue.read_dispatch_id), new_read_dispatch_id,
+        old_read_dispatch_id);
+    if (initial_value == old_read_dispatch_id) break;
+    old_read_dispatch_id = initial_value;
+#else  // !defined(_MSC_VER) && !defined(__GNUC__)
+#error Not implemented
+#endif  // !defined(_MSC_VER) && !defined(__GNUC__)
+  }
+
   // Run AQL dispatch in HW
   eventBegin(MainEngine);
+
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 954
   iCmd()->CmdDispatchAql(dispatchParam);
+#else  // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954
+  Pal::DispatchAqlFeedback feedback{};
+  iCmd()->CmdDispatchAql(dispatchParam, &feedback);
+  amd_queue.compute_tmpring_size = feedback.tmpRingSize;
+#endif  // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954
 
   if (id != gpuEvent.id_) {
     LogError("Something is wrong. ID mismatch!\n");
diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp
index 30d216f7a6..6cc134e270 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.hpp
@@ -36,6 +36,11 @@
 #include "palQueue.h"
 #include "palFence.h"
 #include "palLinearAllocator.h"
+#include "amd_hsa_queue.h"
+
+#ifdef _WIN32
+#include <winnt.h>
+#endif  // _WIN32
 
 /*! \addtogroup PAL PAL Resource Implementation
  *  @{
@@ -55,12 +60,13 @@ class Kernel;
 
 struct AqlPacketMgmt : public amd::EmbeddedObject {
   static constexpr uint32_t kAqlPacketsListSize = 4 * Ki;
-  AqlPacketMgmt() : packet_index_(0) { memset(aql_vgpus_, 0, sizeof(aql_vgpus_)); }
+  explicit AqlPacketMgmt(const Device& dev);  //!< Fills amd_queue_ from the device properties
 
-  hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize];  //!< The list of AQL packets
+  amd_queue_t amd_queue_{};  //!< AMD queue descriptor that describes the aql_packets_ list
+  alignas(sizeof(hsa_kernel_dispatch_packet_t))
+      hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize];  //!< The list of AQL packets
   GpuEvent aql_events_[kAqlPacketsListSize];    //!< The list of gpu for each AQL packet
   VirtualGPU* aql_vgpus_[kAqlPacketsListSize];  //!< The list of vgpus which had submissions
-  std::atomic<uint64_t> packet_index_;          //!< The active packet slot index
 };
 
 enum class BarrierType : uint8_t {
@@ -596,15 +602,26 @@ class VirtualGPU : public device::VirtualDevice {
   }
 
   //! Returns the current active slot for AQL packet
-  hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) {
+  std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
+  GetAqlPacketSlot() const {
     auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
     // Atomic increment global AQL index and wrap around max AQL list size
-    *index = ++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize;
-    if (mgmt.aql_events_[*index].isValid()) {
+    uint64_t packet_id =  // Value of write_dispatch_id before the atomic increment
+#if defined(__GNUC__)
+        __atomic_fetch_add(&mgmt.amd_queue_.write_dispatch_id, 1, __ATOMIC_RELAXED);
+#elif defined(_MSC_VER)
+        InterlockedExchangeAdd64(
+            reinterpret_cast<LONG64 volatile*>(&mgmt.amd_queue_.write_dispatch_id), 1);
+#else  // !defined(_MSC_VER) && !defined(__GNUC__)
+#error Not implemented
+#endif  // !defined(_MSC_VER) && !defined(__GNUC__)
+
+    uint32_t index = packet_id % mgmt.amd_queue_.hsa_queue.size;  // Slot in the packet list
+    if (mgmt.aql_events_[index].isValid()) {
       // Make sure GPU doesn't process this slot
-      mgmt.aql_vgpus_[*index]->waitForEvent(&mgmt.aql_events_[*index]);
+      mgmt.aql_vgpus_[index]->waitForEvent(&mgmt.aql_events_[index]);
     }
-    return &mgmt.aql_packets_[*index];
+    return {&mgmt.aql_packets_[index], packet_id};
   }
 
  protected: