diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp
index 38862dda44..33438dfc8f 100644
--- a/rocclr/runtime/device/devkernel.cpp
+++ b/rocclr/runtime/device/devkernel.cpp
@@ -799,6 +799,11 @@ void Kernel::InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize) {
 
     // Allocate the hidden arguments, but abstraction layer will skip them
     if (isHidden) {
+      // A hidden completion action argument sets this kernel's dynamic parallelism flag
+      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) {
+        setDynamicParallelFlag(true);
+      }
+
       offset = amd::alignUp(offset, alignment);
       desc.offset_ = offset;
       desc.size_ = size;
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 95ac784907..a16eef782b 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -2179,15 +2179,24 @@ bool Device::createBlitProgram() {
   // Delayed compilation due to brig_loader memory allocation
   const char* scheduler = nullptr;
   const char* ocl20 = nullptr;
-#if !defined(WITH_LIGHTNING_COMPILER)
+
   std::string sch = SchedulerSourceCode;
   if (settings().oclVersion_ >= OpenCL20) {
     size_t loc = sch.find("%s");
     sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
+#if defined(WITH_LIGHTNING_COMPILER)
+    // For LC, replace each "amd_scheduler" with "amd_scheduler_pal"; replace() throws on npos
+    static const char AmdScheduler[] = "amd_scheduler";
+    static const char AmdSchedulerPal[] = "amd_scheduler_pal";
+    for (loc = sch.find(AmdScheduler); loc != std::string::npos;
+         loc = sch.find(AmdScheduler, loc + strlen(AmdSchedulerPal))) {
+      sch.replace(loc, strlen(AmdScheduler), AmdSchedulerPal);
+    }
+#endif
     scheduler = sch.c_str();
     ocl20 = "-cl-std=CL2.0";
   }
-#endif  // !defined(WITH_LIGHTNING_COMPILER)
+
   blitProgram_ = new BlitProgram(context_);
   // Create blit programs
   if (blitProgram_ == nullptr || !blitProgram_->create(this, scheduler, ocl20)) {
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 2b2df120ed..504285f256 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -421,6 +421,34 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
     workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str();
   }
 
+  // Kernel metadata may name a runtime handle symbol; when it does, the kernel object
+  // (gpuAqlCode()) is written to that symbol's GPU address inside the code segment
+  if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
+    hsa_agent_t agent;
+    agent.handle = 1;
+
+    // Get the runtime handle symbol; fail instead of dereferencing a missing symbol
+    amd::hsa::loader::Symbol* rth_symbol =
+        prog_.GetSymbol(kernelMD->mAttrs.mRuntimeHandle.c_str(), &agent);
+    if (rth_symbol == nullptr) {
+      return false;
+    }
+
+    // Get the runtime handle symbol GPU address
+    uint64_t symbol_address = 0;
+    if (!rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address)) {
+      return false;
+    }
+
+    // Copy the kernel_object pointer to the runtime handle symbol GPU address
+    const Memory& codeSegGpu = prog_.codeSegGpu();
+    uint64_t offset = symbol_address - codeSegGpu.vmAddress();
+    uint64_t kernel_object = gpuAqlCode();
+    VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
+
+    codeSegGpu.writeRawData(*gpu, offset, sizeof(kernel_object), &kernel_object, true);
+  }
+
   // Copy wavefront size
   workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
 
diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp
index f1b920c257..2d5c57a3de 100644
--- a/rocclr/runtime/device/pal/palprogram.cpp
+++ b/rocclr/runtime/device/pal/palprogram.cpp
@@ -1592,11 +1592,6 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
         std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
   }
 
-  // Allocate kernel table for device enqueuing
-  if (!isNull() && false /*dynamicParallelism*/ && !allocKernelTable()) {
-    return false;
-  }
-
   // Get the list of global variables
   std::vector<std::string> glbVarNames;
   status = executable_->IterateSymbols(GetGlobalVarNamesCallback, &glbVarNames);
diff --git a/rocclr/runtime/device/pal/palprogram.hpp b/rocclr/runtime/device/pal/palprogram.hpp
index e7046e285c..e89b8f5ca4 100644
--- a/rocclr/runtime/device/pal/palprogram.hpp
+++ b/rocclr/runtime/device/pal/palprogram.hpp
@@ -181,6 +181,11 @@ class HSAILProgram : public device::Program {
   //! Global variables are a part of the code segment
   bool GlobalVariables() const { return globalVars_; }
 
+  //! Returns the symbol that executable_ reports for the given name and agent
+  amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const {
+    return executable_->GetSymbol(symbol_name, agent);
+  }
+
  protected:
   //! pre-compile setup for GPU
   virtual bool initBuild(amd::option::Options* options);
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 2a40468b6c..438b43fede 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -1894,18 +1894,10 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
       print << wraps[i].aql.grid_size_y << ", ";
       print << wraps[i].aql.grid_size_z << "]\n";
 
-      uint64_t* kernels =
-        (uint64_t*)(const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
-      for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
-        if (kernels[j] == wraps[i].aql.kernel_object) {
-          break;
-        }
-      }
-      const_cast<Memory*>(hsaKernel.prog().kernelTable())->unmap(this);
       HSAILKernel* child = nullptr;
       for (auto it = hsaKernel.prog().kernels().begin();
         it != hsaKernel.prog().kernels().end(); ++it) {
-        if (j == static_cast<HSAILKernel*>(it->second)->index()) {
+        if (wraps[i].aql.kernel_object == static_cast<HSAILKernel*>(it->second)->gpuAqlCode()) {
           child = static_cast<HSAILKernel*>(it->second);
         }
       }
@@ -1996,14 +1988,15 @@ bool VirtualGPU::PreDeviceEnqueue(
   }
   *vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress();
 
-  (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress());
+  (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable());
+
   // Acquire USWC memory for the scheduler parameters
   (*gpuDefQueue)->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam));
 
   // Add memory handles before the actual dispatch
   addVmMemory((*gpuDefQueue)->virtualQueue_);
   addVmMemory((*gpuDefQueue)->schedParams_);
-  addVmMemory(hsaKernel.prog().kernelTable());
+
   return true;
 }
 
@@ -3252,10 +3245,20 @@ amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) {
   return mem;
 }
 
-void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) {
-  const static bool Wait = true;
-  vqHeader_->kernel_table = kernelTable;
-  virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
+//! Stores the kernel table address in the virtual queue header and writes the header
+//! to the virtual queue memory. A null kernelTable stores address 0.
+void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable) {
+  if (nullptr == kernelTable) {
+    vqHeader_->kernel_table = 0;
+  } else {
+    vqHeader_->kernel_table = kernelTable->vmAddress();
+    // Track the table on hostQ: PreDeviceEnqueue() passes itself as hostQ and adds the
+    // virtual queue and scheduler parameters to its own list, not to this queue's
+    hostQ.addVmMemory(kernelTable);
+  }
+
+  // true: wait flag for the header write
+  virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, true);
 }
 
 void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt,
diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp
index 221b98b36e..85fc889e6f 100644
--- a/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/rocclr/runtime/device/pal/palvirtual.hpp
@@ -423,7 +423,7 @@ class VirtualGPU : public device::VirtualDevice {
   Memory* vQueue() const { return virtualQueue_; }
 
   //! Update virtual queue header
-  void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
+  void writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable);
 
   //! Returns TRUE if virtual queue was successfully allocatted
   bool createVirtualQueue(uint deviceQueueSize  //!< Device queue size