diff --git a/rocclr/device/pal/palconstbuf.hpp b/rocclr/device/pal/palconstbuf.hpp
index 7acb8674c0..c9b72a4b40 100644
--- a/rocclr/device/pal/palconstbuf.hpp
+++ b/rocclr/device/pal/palconstbuf.hpp
@@ -65,6 +65,16 @@ class ManagedBuffer : public amd::EmbeddedObject {
   //! Returns VirtualGPU object this managed resource associated
   VirtualGPU& gpu() const { return gpu_; }
 
+  //! Performs a dummy 64-bit CPU read from the start of the currently active buffer.
+  //! The value is discarded; only the read access itself matters to the caller.
+  //! NOTE(review): assumes the active buffer is CPU-mapped (data() is non-null) and at
+  //! least 8 bytes in size - confirm against the buffer creation path.
+  void CpuReadBack() const {
+    // The source pointer is volatile-qualified so the compiler must emit a real load and
+    // cannot elide it or satisfy it from a value it already knows
+    volatile auto tmp = *reinterpret_cast<volatile uint64_t*>(pool_[activeBuffer_].buf->data());
+  }
+
  private:
   struct TimeStampedBuffer {
     Memory* buf;
diff --git a/rocclr/device/pal/palsettings.cpp b/rocclr/device/pal/palsettings.cpp
index a4902e95f1..a5d08c50be 100644
--- a/rocclr/device/pal/palsettings.cpp
+++ b/rocclr/device/pal/palsettings.cpp
@@ -143,7 +143,7 @@ Settings::Settings() {
   alwaysResident_ = amd::IS_HIP ? true : false;
   prepinnedMinSize_ = 0;
   cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
-  useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? false : HIP_FORCE_DEV_KERNARG;
+  useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? true : HIP_FORCE_DEV_KERNARG;
 
   limit_blit_wg_ = 16;
 }
diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp
index b38c210de8..56b734bced 100644
--- a/rocclr/device/pal/palvirtual.cpp
+++ b/rocclr/device/pal/palvirtual.cpp
@@ -370,8 +370,9 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b
 // ================================================================================================
 bool VirtualGPU::Queue::flush() {
   amd::ScopedLock l(lock_);
+  const Settings& settings = gpu_.dev().settings();
 
-  if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
+  if (!settings.alwaysResident_ && palMemRefs_.size() != 0) {
     if (Pal::Result::Success !=
         iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
                                       Pal::GpuMemoryRefCantTrim)) {
@@ -410,10 +411,15 @@ bool VirtualGPU::Queue::flush() {
   submitInfo.fenceCount           = 1;
   submitInfo.ppFences             = &iCmdFences_[cmdBufIdSlot_];
 
-  if (amd::IS_HIP) {
-    // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
-    // Force CPU sync if there are pending operations on SDMA, until OS fences will be added
-    if (iQueue_->Type() == Pal::QueueTypeCompute) {
+  if (iQueue_->Type() == Pal::QueueTypeCompute) {
+    if (settings.useDeviceKernelArg_) {
+      // If runtime uses device memory for kernel arguments, then perform a CPU read back on
+      // submission. That will make sure NBIO pushes all previous CPU write requests through PCIe
+      gpu_.managedBuffer().CpuReadBack();
+    }
+    if (amd::IS_HIP) {
+      // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
+      // Force CPU sync if there are pending operations on SDMA, until OS fences will be added
       gpu_.WaitForIdleSdma();
     }
   }
@@ -487,7 +493,7 @@ bool VirtualGPU::Queue::flush() {
       }
     }
   }
-  if (!gpu_.dev().settings().alwaysResident_ && palMems_.size() != 0) {
+  if (!settings.alwaysResident_ && palMems_.size() != 0) {
     iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_);
     palMems_.clear();
   }