diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index b0a23a45fe..359331cc85 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -2100,7 +2100,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
   profilingBegin(vcmd);
 
   // Submit kernel to HW
-  if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) {
+  if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
     vcmd.setStatus(CL_INVALID_OPERATION);
   }
 
@@ -2110,7 +2110,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                       const_address parameters, bool nativeMem,
-                                      amd::Event* enqueueEvent)
+                                      amd::Event* enqueueEvent, uint32_t sharedMemBytes)
 {
   size_t newOffset[3] = { 0, 0, 0 };
   size_t newGlobalSize[3] = { 0, 0, 0 };
@@ -2232,7 +2232,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     uint64_t vmParentWrap = 0;
     // Program the kernel arguments for the GPU execution
     hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
-      *this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
+      *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
     if (nullptr == aqlPkt) {
       LogError("Couldn't load kernel arguments");
       return false;
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index a7a6b38ee7..aff4163677 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -299,7 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
       const amd::Kernel& kernel,           //!< Kernel for execution
       const_address parameters,            //!< Parameters for the kernel
       bool nativeMem = true,               //!< Native memory objects
-      amd::Event* enqueueEvent = nullptr   //!< Event provided in the enqueue kernel command
+      amd::Event* enqueueEvent = nullptr,  //!< Event provided in the enqueue kernel command
+      uint32_t sharedMemBytes = 0          //!< Shared memory size
       );
   void submitNativeFn(amd::NativeFnCommand& vcmd);
   void submitFillMemory(amd::FillMemoryCommand& vcmd);
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 38cbde14c9..1c6567adfa 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1938,7 +1938,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 }
 
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
-                                      const_address parameters, void* eventHandle) {
+                                      const_address parameters, void* eventHandle, uint32_t sharedMemBytes) {
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
   size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -2113,7 +2113,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? local[2] : 1;
 
     dispatchPacket.kernarg_address = argBuffer;
-    dispatchPacket.group_segment_size = ldsUsage;
+    dispatchPacket.group_segment_size = ldsUsage + sharedMemBytes;
     dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
 
     // Dispatch the packet
@@ -2154,7 +2154,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 
   // Submit kernel to HW
   if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
-                            static_cast<void*>(as_cl(&vcmd.event())))) {
+                            static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes())) {
     LogError("AQL dispatch failed!");
     vcmd.setStatus(CL_INVALID_OPERATION);
   }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index 0e9e18981b..7b7d9290b2 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -166,7 +166,8 @@ class VirtualGPU : public device::VirtualDevice {
   bool submitKernelInternal(const amd::NDRangeContainer& sizes,  //!< Workload sizes
                             const amd::Kernel& kernel,           //!< Kernel for execution
                             const_address parameters,            //!< Parameters for the kernel
-                            void* event_handle  //!< Handle to OCL event for debugging
+                            void* event_handle,  //!< Handle to OCL event for debugging
+                            uint32_t sharedMemBytes = 0 //!< Shared memory size
                             );
   void submitNativeFn(amd::NativeFnCommand& cmd);
   void submitMarker(amd::Marker& cmd);
diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp
index 99873ac981..19556b9939 100644
--- a/projects/clr/rocclr/runtime/platform/command.cpp
+++ b/projects/clr/rocclr/runtime/platform/command.cpp
@@ -226,8 +226,8 @@ void Command::enqueue() {
 const Context& Command::context() const { return queue_->context(); }
 
 NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
-                                           Kernel& kernel, const NDRangeContainer& sizes)
-    : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes) {
+                                           Kernel& kernel, const NDRangeContainer& sizes, uint32_t sharedMemBytes)
+    : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes), sharedMemBytes_(sharedMemBytes) {
   auto& device = queue.device();
   auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
   profilingInfo_.setCallback(devKernel->getProfilingCallback(
diff --git a/projects/clr/rocclr/runtime/platform/command.hpp b/projects/clr/rocclr/runtime/platform/command.hpp
index e979112ada..bad260289e 100644
--- a/projects/clr/rocclr/runtime/platform/command.hpp
+++ b/projects/clr/rocclr/runtime/platform/command.hpp
@@ -752,11 +752,12 @@ class NDRangeKernelCommand : public Command {
   Kernel& kernel_;
   NDRangeContainer sizes_;
   address parameters_;
+  uint32_t sharedMemBytes_;
 
  public:
   //! Construct an ExecuteKernel command
   NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel,
-                       const NDRangeContainer& sizes);
+                       const NDRangeContainer& sizes, uint32_t sharedMemBytes = 0);
 
   virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); }
 
@@ -772,6 +773,9 @@ class NDRangeKernelCommand : public Command {
   //! Return the kernel NDRange.
   const NDRangeContainer& sizes() const { return sizes_; }
 
+  //! Return the shared memory size
+  uint32_t sharedMemBytes() const { return sharedMemBytes_; }
+
   //! Set the local work size.
   void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }