diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index c64523b3dc..b23521e1fb 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc.
+/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -394,9 +394,20 @@ void GraphExec::GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph) {
 }
 
 // ================================================================================================
-hipError_t GraphExec::AllocKernelArgForGraphNode() {
+hipError_t GraphExec::CaptureAndFormPacketsForGraph() {
   hipError_t status = hipSuccess;
-  for (auto& node : topoOrder_) {
+
+  // Clear previous capture status and batches
+  nodeCaptureStatus_.clear();
+  nodeCaptureStatus_.resize(topoOrder_.size(), false);
+
+  // Clear previous batches
+  packetBatches_.clear();
+
+  // Process nodes and create batches of consecutive captured nodes
+  for (size_t i = 0; i < topoOrder_.size(); ++i) {
+    auto& node = topoOrder_[i];
+
     if (node->GetType() == hipGraphNodeTypeKernel) {
       // Check if graph requires hidden heap and set as part of graphExec param.
       static bool initialized = false;
@@ -405,19 +416,51 @@ hipError_t GraphExec::AllocKernelArgForGraphNode() {
         initialized = true;
       }
     }
+
     if (node->GraphCaptureEnabled()) {
-      status = node->CaptureAndFormPacket(GetKernelArgManager());
+      // Start of a potential batch - try to capture packets for this node
+      std::vector<uint8_t*> currentBatch;
+      std::vector<std::string> currentKernelNames;
+
+      // Collect packets from consecutive captured nodes
+      size_t j = i;
+      size_t capturedNodeCount = 0;
+      while (j < topoOrder_.size() && topoOrder_[j]->GraphCaptureEnabled()) {
+        auto& currentNode = topoOrder_[j];
+        status = currentNode->CaptureAndFormPacket(GetKernelArgManager(), &currentBatch,
+                                                             &currentKernelNames);
+
+        if (status != hipSuccess || currentBatch.empty()) {
+          LogError("Packet capture failed");
+          return status;
+        }
+        // Mark this node as successfully captured
+        nodeCaptureStatus_[j] = true;
+        ++j;
+        ++capturedNodeCount;
+      }
+
+      // Add the batch if it has packets
+      if (!currentBatch.empty()) {
+        packetBatches_.emplace_back(std::move(currentBatch), std::move(currentKernelNames),
+                                    capturedNodeCount);
+      }
+
+      // Skip the nodes we just processed, the index will be incremented by the loop
+      i = j - 1;
     } else if (node->GetType() == hipGraphNodeTypeGraph) {
       auto childNode = reinterpret_cast<hip::ChildGraphNode*>(node);
       if (childNode->GetChildGraph()->max_streams_ == 1) {
         childNode->SetGraphCaptureStatus(true);
-        status = childNode->AllocKernelArgForGraphNode();
+        status = childNode->CaptureAndFormPacketsForGraph();
+        nodeCaptureStatus_[i] = (status == hipSuccess);
         if (status != hipSuccess) {
-          return status;
+          status = hipSuccess; // Continue with other nodes
         }
       }
     }
   }
+
   return status;
 }
 
@@ -427,8 +470,8 @@ hipError_t GraphExec::CaptureAQLPackets() {
   size_t kernArgSizeForGraph = 0;
   GetKernelArgSizeForGraph(kernArgSizeForGraph);
   // When we support multi device graph lauch we need to allocate the kenel args on respective
-  // device for each kernel Assume graph has nodes of same device allocate kernel args on the device
-  // from the first node
+  // device for each kernel Assume graph has nodes of same device allocate kernel args on the
+  // device from the first node
   auto device = g_devices[topoOrder_[0]->GetDeviceId()]->devices()[0];
   // Add a larger initial pool to accomodate for any updates to kernel args
   bool bStatus =
@@ -437,10 +480,11 @@ hipError_t GraphExec::CaptureAQLPackets() {
     return hipErrorMemoryAllocation;
   }
 
-  status = AllocKernelArgForGraphNode();
+  status = CaptureAndFormPacketsForGraph();
   if (status != hipSuccess) {
     return status;
   }
+
   kernArgManager_->ReadBackOrFlush();
   return status;
 }
@@ -471,18 +515,37 @@ hipError_t GraphExec::EnqueueGraphWithSingleList(hip::Stream* hip_stream) {
   if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
     accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr);
   }
-  for (int i = 0; i < topoOrder_.size(); i++) {
-    if (topoOrder_[i]->GraphCaptureEnabled()) {
-      if (topoOrder_[i]->GetEnabled()) {
-        std::vector<uint8_t*>& gpuPackets = topoOrder_[i]->GetAqlPackets();
-        for (auto& packet : gpuPackets) {
-          hip_stream->vdev()->dispatchAqlPacket(packet, topoOrder_[i]->GetKernelName(), accumulate);
+
+  size_t batchIndex = 0;
+
+  // Process nodes in topological order with mixed execution strategy
+  for (size_t i = 0; i < topoOrder_.size(); ++i) {
+    auto& node = topoOrder_[i];
+
+    if (!node->GraphCaptureEnabled()) {
+      // Node doesn't support capture - execute individually
+      node->SetStream(hip_stream);
+      status = node->CreateCommand(node->GetQueue());
+      node->EnqueueCommands(hip_stream);
+    } else if (i < nodeCaptureStatus_.size() && nodeCaptureStatus_[i]) {
+      // Node was successfully captured - find which batch it belongs to
+      // and dispatch the entire batch
+      if (batchIndex < packetBatches_.size()) {
+        // Dispatch this batch
+        bool batchStatus = hip_stream->vdev()->dispatchAqlPacketBatch(
+            packetBatches_[batchIndex].packets, packetBatches_[batchIndex].kernelNames, accumulate);
+        if (!batchStatus) {
+          status = hipErrorUnknown;
+          accumulate->release();
+          return status;
         }
+
+        // Skip all consecutive captured nodes that belong to this batch
+        // Use the tracked node count to skip directly instead of parsing one by one
+        i += packetBatches_[batchIndex].capturedNodeCount - 1;  // -1 because loop will increment
+
+        ++batchIndex;
       }
-    } else {
-      topoOrder_[i]->SetStream(hip_stream);
-      status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
-      topoOrder_[i]->EnqueueCommands(hip_stream);
     }
   }
 
diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp
index babcb5f87c..c6767508c9 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.hpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.hpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc.
+/* Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -245,15 +245,22 @@ class GraphNode : public hipGraphNodeDOTAttribute {
   size_t GetKerArgSize() const { return alignedKernArgSize_; }
   size_t GetKernargSegmentByteSize() const { return kernargSegmentByteSize_; }
   size_t GetKernargSegmentAlignment() const { return kernargSegmentAlignment_; }
-  hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr) {
+
+  //! Capture packets and accumulate them into a batch if provided
+  hipError_t CaptureAndFormPacket(GraphKernelArgManager* kernArgMgr,
+                                  std::vector<uint8_t*>* batchPackets = nullptr,
+                                  std::vector<std::string>* batchKernelNames = nullptr) {
     auto capture_stream = hip::getNullStream(g_devices[dev_id_]->devices()[0]->context(), false);
     hipError_t status = CreateCommand(capture_stream);
     if (status != hipSuccess) {
       return status;
     }
 
+    // Release last created packet memory before they are overwritten with new packets
     std::for_each(gpuPackets_.begin(), gpuPackets_.end(), [](auto p) { delete[] p; });
+    // Clear the pointer array
     gpuPackets_.clear();
+
     for (auto& command : commands_) {
       command->setPktCapturingState(true, &gpuPackets_, kernArgMgr, &capturedKernelName_);
       // Enqueue command to capture GPU Packet. The packet is not submitted to the device.
@@ -261,6 +268,15 @@ class GraphNode : public hipGraphNodeDOTAttribute {
       command->submit(*(command->queue())->vdev());
       command->release();
     }
+
+    // Accumulate packets directly into the batch (only if batch vectors are provided)
+    if (batchPackets != nullptr && batchKernelNames != nullptr) {
+      for (auto& packet : gpuPackets_) {
+        batchPackets->push_back(packet);
+        batchKernelNames->push_back(capturedKernelName_);
+      }
+    }
+
     // Commands are captured and released. Clear them from the object.
     commands_.clear();
 
@@ -814,6 +830,9 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
     if (instantiateDeviceId_ != -1) {
       static_cast<ReferenceCountedObject*>(g_devices[instantiateDeviceId_])->release();
     }
+
+    packetBatches_.clear();
+    nodeCaptureStatus_.clear();
   }
 
   Node GetClonedNode(Node node) {
@@ -849,7 +868,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
   }
   GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; }
   static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
-  hipError_t AllocKernelArgForGraphNode();
+  hipError_t CaptureAndFormPacketsForGraph();
   void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph);
   hipError_t EnqueueGraphWithSingleList(hip::Stream* hip_stream);
   bool TopologicalOrder() { return Graph::TopologicalOrder(topoOrder_); }
@@ -863,6 +882,23 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
   int instantiateDeviceId_ = -1;
   bool hasHiddenHeap_ = false;  //!< Hidden heap indicator for Kernel node
   bool repeatLaunch_ = false;
+
+  //! Structure for batch dispatch optimization - packets and kernel names in aligned memory
+  struct PacketBatch {
+    std::vector<uint8_t*> packets;
+    std::vector<std::string> kernelNames;
+    size_t capturedNodeCount;  // Number of consecutive captured nodes in this batch
+
+    PacketBatch() : capturedNodeCount(0) {}
+    PacketBatch(std::vector<uint8_t*>&& p, std::vector<std::string>&& k, size_t nodeCount)
+      : packets(std::move(p)), kernelNames(std::move(k)), capturedNodeCount(nodeCount) {}
+  };
+
+  //! Batches of accumulated packets and kernel names for batch dispatch optimization
+  //! Each batch contains packets from consecutive captured nodes
+  std::vector<PacketBatch> packetBatches_;
+  //! Track which nodes were successfully captured (true) vs need individual execution (false)
+  std::vector<bool> nodeCaptureStatus_;
 };
 
 class ChildGraphNode : public GraphNode, public GraphExec {
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index c6660bc30e..25d7ea9825 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
+/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -1351,10 +1351,11 @@ class VirtualDevice : public amd::HeapObject {
   virtual bool isFenceDirty() const = 0;
   //! Init hidden heap for device memory allocations
   virtual void HiddenHeapInit() = 0;
-  //! Dispatch captured AQL packet
-  virtual bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
-                                 amd::AccumulateCommand* vcmd = nullptr) = 0;
 
+  //! Dispatches multiple AQL packets in a single batch operation
+  virtual bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
+                                      const std::vector<std::string>& kernelNames,
+                                      amd::AccumulateCommand* vcmd = nullptr) = 0 ;
   //! Returns the number of outstanding HSA async handlers
   std::atomic<uint64_t>& QueuedAsyncHandlers() const { return queued_async_handlers_; }
 
diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp
index dfb0b63632..115d929719 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.hpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
+/* Copyright (c) 2015 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -355,9 +355,10 @@ class VirtualGPU : public device::VirtualDevice {
 
   void HiddenHeapInit() {}
 
-  inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
-                                amd::AccumulateCommand* vcmd = nullptr) {
-    vcmd->addKernelName(kernelName);
+  //! Dispatches multiple AQL packets in a single batch operation
+  bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
+                              const std::vector<std::string>& kernelNames,
+                              amd::AccumulateCommand* vcmd = nullptr) {
     return false;
   }
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 8dbdcd794d..89153e20fc 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc.
+/* Copyright (c) 2013 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -436,8 +436,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
 
   // Peep signal +2 ahead to see if its done
   auto temp_id = (current_id_ + 2) % signal_list_.size();
-  // If GPU is still busy with processing, then add more signals to avoid more frequent stalls
-  if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) {
+  // If GPU is still busy with processing or if timestamps havent been saved out,
+  // then add more signals to avoid more frequent stalls
+  if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0 ||
+      !signal_list_[temp_id]->flags_.done_) {
     std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
     if ((signal != nullptr) && CreateSignal(signal.get())) {
       // Find valid new index
@@ -453,10 +455,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
   if (!new_signal) {
     // Find valid index
     ++current_id_ %= signal_list_.size();
-
     // Make sure the previous operation on the current signal is done
     WaitCurrent();
 
+    size_t next = (current_id_ + 1) % signal_list_.size();
     // Have to wait the next signal in the queue to avoid a race condition between
     // a GPU waiter(which may be not triggered yet) and CPU signal reset below
     WaitNext();
@@ -1009,7 +1011,6 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
 
   // Check for queue full and wait if needed.
   uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
-  uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
   fence_dirty_ = true;
 
   if (addSystemScope_) {
@@ -1055,16 +1056,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
     current_signal->flags_.isPacketDispatch_ = true;
   }
 
-
   // Make sure the slot is free for usage
   while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
-    amd::Os::yield();
+    // Active spin - no yield
   }
 
   // Add blocking command if the original value of read index was behind of the queue size.
   // Note: direct dispatch relies on the slot stall above to keep the forward progress
   // of the app if a dispatched kernel requires some CPU input for completion
-  if (blocking || (!AMD_DIRECT_DISPATCH && (index - read) >= sw_queue_size)) {
+  if (blocking || (!AMD_DIRECT_DISPATCH &&
+                   (index - hsa_queue_load_read_index_relaxed(gpu_queue_)) >= sw_queue_size)) {
     if (packet->completion_signal.handle == 0) {
       packet->completion_signal = Barriers().ActiveSignal();
     }
@@ -1102,7 +1103,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, ui
           reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
           reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
           reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
-          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2, read, index);
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
+          hsa_queue_load_read_index_scacquire(gpu_queue_), index);
 
   hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
 
@@ -1158,34 +1160,229 @@ bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t he
 }
 
 // ================================================================================================
-inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
-                                          amd::AccumulateCommand* vcmd) {
-  if (vcmd == nullptr) {
+template <typename AqlPacket>
+bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& packets,
+                                               bool blocking, bool attach_signal,
+                                               const std::vector<std::string>* kernelNames) {
+  if (packets.empty()) {
     return false;
   }
 
-  vcmd->addKernelName(kernelName);
-  amd::ScopedLock lock(execution());
+  const uint32_t queueSize = gpu_queue_->size;
+  const uint32_t queueMask = queueSize - 1;
+  const uint32_t sw_queue_size = queueMask;
+  const size_t numPackets = packets.size();
+  size_t kMaxBatchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
+  const size_t kGpuLagPackets = 16;
 
+  // Staggered copy pattern: powers of 2 (1, 2, 4, 8.. to DEBUG_HIP_GRAPH_BATCH_SIZE
+  size_t processedPackets = 0;
+  size_t batchSize = 1;
+
+  while (processedPackets < numPackets) {
+    uint64_t currentReadIndex = hsa_queue_load_read_index_scacquire(gpu_queue_);
+    uint64_t currentWriteIndex = hsa_queue_load_write_index_relaxed(gpu_queue_);
+
+    if (currentWriteIndex - currentReadIndex >= kGpuLagPackets) {
+      //GPU is busy, so we can copy more packets
+      batchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
+    }
+
+    // Process all remaining packets in one batch
+    if (processedPackets + batchSize > numPackets) {
+      batchSize = numPackets - processedPackets;
+    }
+
+    // Check if we have enough space in the queue for this batch
+    // If queue is full, reset batch size to 1 and wait
+    if (currentWriteIndex + batchSize - currentReadIndex >= sw_queue_size) {
+      batchSize = 1;
+    }
+
+    // Now reserve space for the batch
+    uint64_t startIndex = hsa_queue_add_write_index_screlease(gpu_queue_, batchSize);
+
+    // Make sure the slot is free for usage
+    while ((startIndex - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
+      // Active spin - no yield
+    }
+
+    fence_dirty_ = true;
+
+    // Save header of first packet in this batch
+    AqlPacket* firstPacket = packets[processedPackets];
+    uint16_t firstPacketHeader = firstPacket->header;
+    uint16_t firstPacketRest = firstPacket->setup;
+
+    // Process batchSize packets
+    for (size_t i = 0; i < batchSize; ++i) {
+      size_t packetIndex = processedPackets + i;
+      uint64_t index = startIndex + i;
+
+      AqlPacket* packet = packets[packetIndex];
+      uint16_t header = packet->header;
+
+
+      bool attachSignal = timestamp_ != nullptr || attach_signal;
+
+      packet->completion_signal =
+          Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, attachSignal);
+
+      if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value &&
+          timestamp_ != nullptr) {
+        // If profiling is enabled, store the correlation ID in the dispatch packet
+        if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
+          auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
+          dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
+        }
+
+        ProfilingSignal* current_signal = Barriers().GetLastSignal();
+        current_signal->flags_.isPacketDispatch_ = true;
+      }
+
+      //Add blocking command if needed (only for the last packet)
+      if (blocking && (packetIndex == numPackets - 1)) {
+        if (packet->completion_signal.handle == 0) {
+          packet->completion_signal = Barriers().ActiveSignal();
+        }
+      }
+
+      AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
+
+      // For first packet in batch, invalidate header before writing
+      if (i == 0) {
+        if (addSystemScope_) {
+          // Add system scope on the acq on first packet
+          firstPacketHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
+          firstPacketHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
+        }
+        packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
+
+        // Copy the packet and then write the valid of the first packet
+        *aql_loc = *packet;
+
+        // Restore the header of the first packet
+        packet->header = firstPacketHeader;
+      } else {
+        // For the end packet in batch set flags
+        if (i == batchSize - 1) {
+          if (addSystemScope_) {
+            // Add system scope on the release on last packet
+            packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+            packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+            addSystemScope_ = false;
+          }
+          auto expected_fence_state =
+              extractAqlBits(packet->header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                             HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+          // Reset fence_dirty_ flag if we submit a packet with system scopes
+          if (expected_fence_state == amd::Device::kCacheStateSystem) {
+            fence_dirty_ = false;
+          }
+          fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
+        }
+
+        // Copy the packet to the queue
+        *aql_loc = *packet;
+      }
+
+      // Print kernel name for kernel dispatch packets
+      if (kernelNames && packetIndex < kernelNames->size()) {
+        uint8_t packetType =
+            extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
+        if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
+          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL, "Graph shader name : %s",
+                  (*kernelNames)[packetIndex].c_str());
+
+          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
+                  "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
+                  "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
+                  "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], "
+                  "private_seg_size=%zu, group_seg_size=%zu, kernel_obj=0x%zx, "
+                  "kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
+                  "rptr=%u, wptr=%u",
+                  gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, packetType,
+                  extractAqlBits(header, HSA_PACKET_HEADER_BARRIER,
+                                HSA_PACKET_HEADER_WIDTH_BARRIER),
+                  extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
+                                HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
+                  extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                                HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+                  packet->setup,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
+                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
+                  hsa_queue_load_read_index_scacquire(gpu_queue_), index);
+            }
+      }
+    }
+
+    // Write valid header for the first packet in the batch
+    AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask];
+    packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), firstPacketHeader, firstPacketRest);
+
+    // Ring doorbell for this batch
+    hsa_signal_store_screlease(gpu_queue_->doorbell_signal, startIndex);
+
+    processedPackets += batchSize;
+
+    TrackQueueProgress(*packets[processedPackets - 1], startIndex + batchSize - 1);
+    // Double the batch size for next iteration, cap at DEBUG_HIP_GRAPH_BATCH_SIZE
+    if (batchSize < kMaxBatchSize) {
+      batchSize *= 2;
+    }
+  }
+
+  // Mark the flag indicating if a dispatch is outstanding
+  hasPendingDispatch_ = true;
+
+  // Wait on signal for the last packet if blocking
+  if (blocking) {
+    LogInfo("Runtime reached the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
+    if (!Barriers().WaitCurrent()) {
+      LogPrintfError("Failed blocking queue wait with signal [0x%lx]",
+                     packets.back()->completion_signal.handle);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// ================================================================================================
+bool VirtualGPU::dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
+                                        const std::vector<std::string>& kernelNames,
+                                        amd::AccumulateCommand* vcmd) {
+  if (vcmd == nullptr || packets.empty() || packets.size() != kernelNames.size()) {
+    return false;
+  }
+
+  amd::ScopedLock lock(execution());
   profilingBegin(*vcmd);
 
   dispatchBlockingWait();
-  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
-  ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Graph shader name : %s", kernelName.c_str());
 
-  // The Aqlpacket with valid header will trigger the issue that AQL fill
-  // the header before filling the body. However, the CP can handle the AQL package
-  // after seeing the valid AQL header with the AQL package's body is NULL.
-  // This patch fixes this potential issue that filling AQL header before
-  // filling the AQL body.
-  uint16_t packetHeader = packet->header;
-  packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
-  dispatchGenericAqlPacket(packet, packetHeader, packet->setup, false);
-  packet->header = packetHeader;
+  // Add all kernel names in bulk
+  vcmd->addKernelNames(kernelNames);
+
+  // Dispatch all packets with a single doorbell ring
+  // Cast packets vector to AQL packets vector on the fly
+  const auto& aqlPackets =
+      reinterpret_cast<const std::vector<hsa_kernel_dispatch_packet_t*>&>(packets);
+  bool result = dispatchGenericAqlPacketBatch(aqlPackets, false, false, &kernelNames);
 
   profilingEnd();
 
-  return true;
+  return result;
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index f12116e425..3ade98d3a7 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
+/* Copyright (c) 2008 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -468,16 +468,23 @@ class VirtualGPU : public device::VirtualDevice {
   //! Dispatches a barrier with blocking HSA signals
   void dispatchBlockingWait();
 
-  inline bool dispatchAqlPacket(uint8_t* aqlpacket, const std::string& kernelName,
-                                amd::AccumulateCommand* vcmd = nullptr);
   bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
                          bool blocking = true, bool capturing = false,
                          const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
   bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
                          bool blocking = true, bool attach_signal = false);
+
+  //! Dispatches multiple AQL packets in a single batch operation
+  bool dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
+                              const std::vector<std::string>& kernelNames,
+                              amd::AccumulateCommand* vcmd = nullptr);
   template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
                                                               uint16_t rest, bool blocking,
                                                               bool attach_signal = false);
+  //! Dispatches multiple AQL packets with a single doorbell ring
+  template <typename AqlPacket> bool dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& packets,
+                                                                   bool blocking, bool attach_signal = false,
+                                                                   const std::vector<std::string>* kernelNames = nullptr);
 
   bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                 bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index 6070f94548..73ed76ea34 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010 - 2024 Advanced Micro Devices, Inc.
+/* Copyright (c) 2010 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -1394,6 +1394,11 @@ class AccumulateCommand : public Command {
   //! Add kernel name to the list if available
   void addKernelName(const std::string& kernelName) { kernelNames_.push_back(kernelName); }
 
+  //! Add multiple kernel names in bulk
+  void addKernelNames(const std::vector<std::string>& kernelNames) {
+    kernelNames_.insert(kernelNames_.end(), kernelNames.begin(), kernelNames.end());
+  }
+
   //! Add kernel timestamp to the list if available
   void addTimestamps(uint64_t startTs, uint64_t endTs) {
     tsList_.push_back(std::make_pair(startTs, endTs));
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 45248fbbbb..a8e8daa7f2 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2009 - 2021 Advanced Micro Devices, Inc.
+/* Copyright (c) 2009 - 2025 Advanced Micro Devices, Inc.
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -255,6 +255,8 @@ release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false,                             \
         "Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \
 release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4,                                \
         "Forces the number of streams for the graph parallel execution")      \
+release(uint, DEBUG_HIP_GRAPH_BATCH_SIZE, 256,                                \
+        "Number of graph nodes to batch at a time")                           \
 release(uint, DEBUG_HIP_BLOCK_SYNC, 50,                                       \
         "Blocks synchronization on CPU until the callback processing is done")\
 release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000,                                 \