From 51a4d1865c394bc02d00a3ebb203a0e5b294cae9 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Tue, 29 May 2018 13:18:29 -0400
Subject: [PATCH] P4 to Git Change 1561015 by gandryey@gera-w8 on 2018/05/29
 13:10:06

	SWDEV-79445 - OCL generic changes and code clean-up
	Optimize memory dependency tracking logic:
	1. Add modified_ field to the event object to track memory writes into device memory.
	2. Check memory dependency only if a write operation is currently requested or previously performed.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#35 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#66 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#100 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#52 edit
---
 rocclr/runtime/device/pal/paldefs.hpp     | 19 ++++++----
 rocclr/runtime/device/pal/palresource.cpp | 25 ++++++++++++
 rocclr/runtime/device/pal/palresource.hpp |  6 +++
 rocclr/runtime/device/pal/palvirtual.cpp  | 46 ++++++++++++-----------
 rocclr/runtime/device/pal/palvirtual.hpp  | 11 +++---
 5 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/rocclr/runtime/device/pal/paldefs.hpp b/rocclr/runtime/device/pal/paldefs.hpp
index d98f4d5000..d0018afd9e 100644
--- a/rocclr/runtime/device/pal/paldefs.hpp
+++ b/rocclr/runtime/device/pal/paldefs.hpp
@@ -44,22 +44,27 @@ struct HwDbgKernelInfo {
 enum EngineType { MainEngine = 0, SdmaEngine, AllEngines };
 
 struct GpuEvent {
-  static const unsigned int InvalidID = ((1 << 30) - 1);
+  static constexpr uint32_t InvalidID = ((1 << 30) - 1);
 
   struct {
-    uint32_t id : 31;         ///< actual event id
-    uint32_t engineId_ : 1;   ///< type of the id
+    uint32_t id_ : 30;        ///< Actual event id
+    uint32_t modified_ : 1;   ///< Resource associated with the event was modified
+    uint32_t engineId_ : 1;   ///< Type of the id
   };
   //! GPU event default constructor
-  GpuEvent() : id(InvalidID), engineId_(MainEngine) {}
+  GpuEvent() : id_(InvalidID), engineId_(MainEngine), modified_(false) {}
   //! GPU event constructor
-  GpuEvent(uint evt) : id(evt), engineId_(MainEngine) {}
+  GpuEvent(uint evt) : id_(evt), engineId_(MainEngine), modified_(false) {}
 
   //! Returns true if the current event is valid
-  bool isValid() const { return (id != InvalidID) ? true : false; }
+  bool isValid() const { return (id_ != InvalidID) ? true : false; }
 
   //! Set invalid event id
-  void invalidate() { id = InvalidID; }
+  void invalidate() { id_ = InvalidID; }
+
+  // Custom assignment operator: copies id_ and engineId_ only, so the target's modified_ is preserved
+  GpuEvent& operator=(const GpuEvent& evt)
+    { id_ = evt.id_; engineId_ = evt.engineId_; return *this; }
 };
 
 /*! \addtogroup PAL
diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp
index cb16e3a695..42c03d25cf 100644
--- a/rocclr/runtime/device/pal/palresource.cpp
+++ b/rocclr/runtime/device/pal/palresource.cpp
@@ -1665,6 +1665,31 @@ GpuEvent* Resource::getGpuEvent(const VirtualGPU& gpu) const {
   return &events_[idx];
 }
 
+// ================================================================================================
+void Resource::setModified(VirtualGPU& gpu, bool modified) const {
+  uint idx = gpu.index();
+  assert(idx < events_.size());
+  events_[idx].modified_ = modified;
+
+  // If current resource is a view, then update the parent as well
+  if (viewOwner_ != nullptr) {
+    viewOwner_->setModified(gpu, modified);
+  }
+}
+
+// ================================================================================================
+bool Resource::isModified(VirtualGPU& gpu) const {
+  uint idx = gpu.index();
+  assert(idx < events_.size());
+  bool modified = events_[idx].modified_;
+
+  // If current resource is a view, then get the parent state as well
+  if (viewOwner_ != nullptr) {
+    modified |= viewOwner_->isModified(gpu);
+  }
+  return modified;
+}
+
 // ================================================================================================
 void Resource::palFree() const {
   if (desc().type_ == OGLInterop) {
diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp
index f25ed34e99..96f55c6f10 100644
--- a/rocclr/runtime/device/pal/palresource.hpp
+++ b/rocclr/runtime/device/pal/palresource.hpp
@@ -382,6 +382,12 @@ class Resource : public amd::HeapObject {
     }
   }
 
+  //! Update the modified field of the event, meaning the resource was updated
+  void setModified(VirtualGPU& gpu, bool modified) const;
+
+  //! Returns the modified state of the event for the given GPU, including the view owner's state
+  bool isModified(VirtualGPU& gpu) const;
+
  protected:
   /*! \brief Creates a PAL iamge object, associated with the resource
   *
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 683d5f6fa6..126310a7ce 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -427,24 +427,26 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
   uint64_t curStart = memory->vmAddress();
   uint64_t curEnd = curStart + memory->size();
 
-  // Loop through all memory objects in the queue and find dependency
-  // @note don't include objects from the current kernel
-  for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
-    // Check if the queue already contains this mem object and
-    // GPU operations aren't readonly
-    uint64_t busyStart = memObjectsInQueue_[j].start_;
-    uint64_t busyEnd = memObjectsInQueue_[j].end_;
+  if (memory->isModified(gpu) || !readOnly) {
+    // Loop through all memory objects in the queue and find dependency
+    // @note don't include objects from the current kernel
+    for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
+      // Check if the queue already contains this mem object and
+      // GPU operations aren't readonly
+      uint64_t busyStart = memObjectsInQueue_[j].start_;
+      uint64_t busyEnd = memObjectsInQueue_[j].end_;
 
-    // Check if the start inside the busy region
-    if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
-         // Check if the end inside the busy region
-         ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
-         // Check if the start/end cover the busy region
-         ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
-        // If the buys region was written or the current one is for write
-        (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
-      flushL1Cache = true;
-      break;
+      // Check if the start inside the busy region
+      if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
+           // Check if the end inside the busy region
+           ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
+           // Check if the start/end cover the busy region
+           ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
+          // If the busy region was written or the current one is for write
+          (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
+        flushL1Cache = true;
+        break;
+      }
     }
   }
 
@@ -471,6 +473,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
   memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd;
   memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly;
   numMemObjectsInQueue_++;
+  // Record whether this access writes to the resource (a read-only access clears the flag)
+  memory->setModified(gpu, !readOnly);
 }
 
 void VirtualGPU::MemoryDependency::clear(bool all) {
@@ -1955,7 +1959,7 @@ void VirtualGPU::PostDeviceEnqueue(
     uint64_t vmParentWrap,
     GpuEvent* gpuEvent)
 {
-  uint32_t id  = gpuEvent->id;
+  uint32_t id  = gpuEvent->id_;
   amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
 
   // Make sure exculsive access to the device queue
@@ -2036,7 +2040,7 @@ void VirtualGPU::PostDeviceEnqueue(
     iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
       vmParentWrap + offsetof(AmdAqlWrap, child_counter),
       signalAddr, dev().settings().useDeviceQueue_);
-    if (id != gpuEvent->id) {
+    if (id != gpuEvent->id_) {
         LogError("Something is wrong. ID mismatch!\n");
     }
     eventEnd(MainEngine, *gpuEvent);
@@ -2133,7 +2137,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
 
   for (int iter = 0; iter < iteration; ++iter) {
     GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
-    uint32_t id = gpuEvent.id;
+    uint32_t id = gpuEvent.id_;
     // Reset global size for dimension dim if split is needed
     if (dim != -1) {
       newOffset[dim] = sizes.offset()[dim] + globalStep * iter;
@@ -2184,7 +2188,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     if (profiling() || state_.profileEnabled_) {
       addBarrier();
     }
-    if (id != gpuEvent.id) {
+    if (id != gpuEvent.id_) {
       LogError("Something is wrong. ID mismatch!\n");
     }
     eventEnd(MainEngine, gpuEvent);
diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp
index 9ca48d151a..fa48024c9d 100644
--- a/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/rocclr/runtime/device/pal/palvirtual.hpp
@@ -463,11 +463,11 @@ class VirtualGPU : public device::VirtualDevice {
     constexpr bool End = false;
     if (forceExec) {
       constexpr bool ForceFlush = true;
-      event.id = queues_[engId]->submit(ForceFlush);
+      event.id_ = queues_[engId]->submit(ForceFlush);
       profileEvent(engId, End);
     } else {
       profileEvent(engId, End);
-      event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
+      event.id_ = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
     }
     event.engineId_ = engId;
   }
@@ -475,7 +475,7 @@ class VirtualGPU : public device::VirtualDevice {
   void waitForEvent(GpuEvent* event) const {
     if (event->isValid()) {
       assert(event->engineId_ < AllEngines);
-      queues_[event->engineId_]->waitForEvent(event->id);
+      queues_[event->engineId_]->waitForEvent(event->id_);
       event->invalidate();
     }
   }
@@ -483,7 +483,7 @@ class VirtualGPU : public device::VirtualDevice {
   bool isDone(GpuEvent* event) {
     if (event->isValid()) {
       assert(event->engineId_ < AllEngines);
-      if (queues_[event->engineId_]->isDone(event->id)) {
+      if (queues_[event->engineId_]->isDone(event->id_)) {
         event->invalidate();
         return true;
       }
@@ -623,9 +623,8 @@ class VirtualGPU : public device::VirtualDevice {
 };
 
 inline void VirtualGPU::addVmMemory(const Memory* memory) {
-  GpuEvent event(queues_[MainEngine]->cmdBufId());
   queues_[MainEngine]->addCmdMemRef(memory->memRef());
-  memory->setBusy(*this, event);
+  memory->setBusy(*this, queues_[MainEngine]->cmdBufId());
 }
 
 inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {