From cfc07c88eeef5be596e4318831c25f0172725dd0 Mon Sep 17 00:00:00 2001
From: German <German.Andryeyev@amd.com>
Date: Fri, 8 Dec 2023 19:02:50 -0500
Subject: [PATCH] SWDEV-436796 - Enable device memory for kernel arguments

An extra CPU read back is performed before every submission to make sure that
previous writes over PCIe have reached the GPU. The HDP flush is done by the CP.

Change-Id: I402d28ca26c8cee4a3920feb3599af8c285d0889
---
 rocclr/device/pal/palconstbuf.hpp |  4 ++++
 rocclr/device/pal/palsettings.cpp |  2 +-
 rocclr/device/pal/palvirtual.cpp  | 18 ++++++++++++------
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/rocclr/device/pal/palconstbuf.hpp b/rocclr/device/pal/palconstbuf.hpp
index 7acb8674c0..c9b72a4b40 100644
--- a/rocclr/device/pal/palconstbuf.hpp
+++ b/rocclr/device/pal/palconstbuf.hpp
@@ -65,6 +65,10 @@ class ManagedBuffer : public amd::EmbeddedObject {
   //! Returns VirtualGPU object this managed resource associated
   VirtualGPU& gpu() const { return gpu_; }
 
+  void CpuReadBack() const {  // Dummy CPU read of one uint64_t at buf->data() of the active buffer
+    volatile auto tmp = *reinterpret_cast<uint64_t*>(pool_[activeBuffer_].buf->data());
+  }  // tmp is not used afterwards; the loaded value is simply discarded
+
  private:
   struct TimeStampedBuffer {
     Memory* buf;
diff --git a/rocclr/device/pal/palsettings.cpp b/rocclr/device/pal/palsettings.cpp
index a4902e95f1..a5d08c50be 100644
--- a/rocclr/device/pal/palsettings.cpp
+++ b/rocclr/device/pal/palsettings.cpp
@@ -143,7 +143,7 @@ Settings::Settings() {
   alwaysResident_ = amd::IS_HIP ? true : false;
   prepinnedMinSize_ = 0;
   cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
-  useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? false : HIP_FORCE_DEV_KERNARG;
+  useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? true : HIP_FORCE_DEV_KERNARG;
 
   limit_blit_wg_ = 16;
 }
diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp
index b38c210de8..56b734bced 100644
--- a/rocclr/device/pal/palvirtual.cpp
+++ b/rocclr/device/pal/palvirtual.cpp
@@ -370,8 +370,9 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b
 // ================================================================================================
 bool VirtualGPU::Queue::flush() {
   amd::ScopedLock l(lock_);
+  const Settings& settings = gpu_.dev().settings();
 
-  if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
+  if (!settings.alwaysResident_ && palMemRefs_.size() != 0) {
     if (Pal::Result::Success !=
         iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
                                       Pal::GpuMemoryRefCantTrim)) {
@@ -410,10 +411,15 @@ bool VirtualGPU::Queue::flush() {
   submitInfo.fenceCount           = 1;
   submitInfo.ppFences             = &iCmdFences_[cmdBufIdSlot_];
 
-  if (amd::IS_HIP) {
-    // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
-    // Force CPU sync if there are pending operations on SDMA, until OS fences will be added
-    if (iQueue_->Type() == Pal::QueueTypeCompute) {
+  if (iQueue_->Type() == Pal::QueueTypeCompute) {
+    if (settings.useDeviceKernelArg_) {
+      // If runtime uses device memory for kernel arguments, then perform a CPU read back on
+      // submission. That will make sure NBIO pushes all previous CPU write requests through PCIE
+      gpu_.managedBuffer().CpuReadBack();
+    }
+    if (amd::IS_HIP) {
+      // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
+      // Force CPU sync if there are pending operations on SDMA, until OS fences will be added
       gpu_.WaitForIdleSdma();
     }
   }
@@ -487,7 +493,7 @@ bool VirtualGPU::Queue::flush() {
       }
     }
   }
-  if (!gpu_.dev().settings().alwaysResident_ && palMems_.size() != 0) {
+  if (!settings.alwaysResident_ && palMems_.size() != 0) {
     iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_);
     palMems_.clear();
   }