From 5cc4a44ce50a284fe281a892ac2dcf9c7e535dc4 Mon Sep 17 00:00:00 2001
From: foreman <dl.swbuild@amd.com>
Date: Fri, 22 Nov 2019 15:53:12 -0500
Subject: [PATCH] P4 to Git Change 2035516 by gandryey@gera-hip-lnx on
 2019/11/22 15:51:48

	SWDEV-79445 - OCL generic changes and code clean-up
	- Don't sync on the scratch buffer if the executed kernel is unchanged, since the number of scratch regs remains the same

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#158 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#65 edit


[ROCm/clr commit: dd5459c7a194a9011ed1996180ec93791ecbbba7]
---
 .../clr/rocclr/runtime/device/pal/palvirtual.cpp     | 12 ++++++++----
 .../clr/rocclr/runtime/device/pal/palvirtual.hpp     |  8 ++++++++
 2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index c77a38b55d..1b16da57eb 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -2379,9 +2379,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     return false;
   }
 
-  // Add ISA memory object to the resource tracking list
-  AddKernel(kernel);
-
   uint64_t vmDefQueue = 0;
   VirtualGPU* gpuDefQueue = nullptr;
   if (hsaKernel.dynamicParallelism()) {
@@ -2400,6 +2397,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     return false;
   }
 
+  // Add ISA memory object to the resource tracking list
+  AddKernel(kernel);
+
   bool needFlush = false;
   // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
   // are in the same cmdBuffer
@@ -3507,7 +3507,11 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
     const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
     // Validate scratch buffer to force sync mode, because
     // the current scratch logic is optimized for size and performance
-    memoryDependency().validate(*this, scratch->memObj_, IsReadOnly);
+    // Note: runtime can skip sync if the same kernel is used,
+    // since the number of scratch regs remains the same
+    if (!IsSameKernel(kernel)) {
+       memoryDependency().validate(*this, scratch->memObj_, IsReadOnly);
+    }
     addVmMemory(scratch->memObj_);
   }
 
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 006251c6b4..5d22ef0482 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -377,6 +377,10 @@ class VirtualGPU : public device::VirtualDevice {
   inline void AddKernel(const amd::Kernel& kernel  //!< AMD kernel object
                         ) const;
 
+  //! Returns true if \a kernel is the same object as last_kernel_ of the MainEngine queue
+  inline bool IsSameKernel(const amd::Kernel& kernel  //!< AMD kernel object
+                           ) const;
+
   //! Adds a dopp desktop texture reference
   void addDoppRef(const Memory* memory,  //!< GPU memory object
                   bool lastDoopCmd,      //!< is the last submission for the pre-present primary
@@ -652,6 +656,10 @@ inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
   queues_[MainEngine]->last_kernel_ = &kernel;
 }
 
+inline bool VirtualGPU::IsSameKernel(const amd::Kernel& kernel) const {
+  return queues_[MainEngine]->last_kernel_ == &kernel;  // address identity, not content equality
+}
+
 template <bool avoidBarrierSubmit> uint VirtualGPU::Queue::submit(bool forceFlush) {
   cmdCnt_++;
   uint id = cmdBufIdCurrent_;