From dd4d5dcb9453399dddeeae42f09cd7f25fecec01 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 26 Apr 2018 11:13:29 -0400
Subject: [PATCH] P4 to Git Change 1546657 by gandryey@gera-w8 on 2018/04/26
 10:59:34

	SWDEV-151739 - [CQE OCL][DTB][Perf][QR][DTB-BLOCKER][VEGA10] Up to 18% performance drop observed while running Video Composition test sub test of Compubench due to faulty CL#1544622
	- Implement customized TS (timestamp) tracking for managed buffers. The common TS tracking mechanism saves the event of the last command, assuming SDMA and compute operations occur in order, but that is not the case for managed buffers. Also, a managed buffer doesn't have to validate the TS of the parent resource.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#22 edit


[ROCm/clr commit: 79ba5904dc09d96a186b72b001ade3de4f8f1ab8]
---
 .../clr/rocclr/runtime/device/pal/palblit.cpp |  2 +-
 .../rocclr/runtime/device/pal/palconstbuf.cpp | 74 +++++++++----------
 .../rocclr/runtime/device/pal/palconstbuf.hpp | 18 +++--
 .../rocclr/runtime/device/pal/palmemory.hpp   |  1 +
 .../rocclr/runtime/device/pal/palresource.hpp |  1 +
 5 files changed, 52 insertions(+), 44 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index e392c018f2..542028f300 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -267,7 +267,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
     chunkSize = gpu().xferWrite().MaxSize();
   } else {
     chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize());
-    chunkSize = std::max(chunkSize, 128 * Ki);
+    chunkSize = std::max(chunkSize, 64 * Ki);
     bool flushDMA = true;
   }
 
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
index c0dfd536e2..ccd6dfb583 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
@@ -11,32 +11,34 @@ namespace pal {
 
 // ================================================================================================
 ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
-    : gpu_(gpu)
-    , buffers_(MaxNumberOfBuffers)
-    , activeBuffer_(0)
-    , size_(size)
-    , wrtOffset_(0)
-    , wrtAddress_(nullptr) {}
+  : gpu_(gpu)
+  , pool_(MaxNumberOfBuffers)
+  , activeBuffer_(0)
+  , size_(size)
+  , wrtOffset_(0)
+  , wrtAddress_(nullptr) {}
 
 // ================================================================================================
 void ManagedBuffer::release() {
-  for (auto it : buffers_) {
-    if ((it != nullptr) && (it->data() != nullptr)) {
-      it->unmap(&gpu_);
+  for (auto it : pool_) {
+    if ((it.buf != nullptr) && (it.buf->data() != nullptr)) {
+      it.buf->unmap(&gpu_);
     }
-    delete it;
+    delete it.buf;
   }
 }
 
 // ================================================================================================
 bool ManagedBuffer::create(Resource::MemoryType type) {
-  for (uint i = 0; i < buffers_.size(); ++i) {
-    buffers_[i] = new Memory(const_cast<pal::Device&>(gpu_.dev()), size_);
-    if (nullptr == buffers_[i] || !buffers_[i]->create(type)) {
+  for (uint i = 0; i < pool_.size(); ++i) {
+    pool_[i].buf = new Memory(const_cast<pal::Device&>(gpu_.dev()), size_);
+    if (nullptr == pool_[i].buf || !pool_[i].buf->create(type)) {
       LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_);
       return false;
     }
-    void* wrtAddress = buffers_[i]->map(&gpu_);
+    // Assign virtual gpu to the allocation. Buffer will be used only on a particular queue
+    pool_[i].buf->memRef()->gpu_ = &gpu_;
+    void* wrtAddress = pool_[i].buf->map(&gpu_);
     if (wrtAddress == nullptr) {
         LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
         return false;
@@ -45,9 +47,9 @@ bool ManagedBuffer::create(Resource::MemoryType type) {
     uint dummy = 0;
     static constexpr bool Wait = true;
     // Write 0 for the buffer paging by VidMM
-    buffers_[i]->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait);
+    pool_[i].buf->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait);
   }
-  wrtAddress_ = buffers_[activeBuffer_]->data();
+  wrtAddress_ = pool_[activeBuffer_].buf->data();
   return true;
 }
 
@@ -59,18 +61,22 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
   // Align reserve size on the vector's boundary
   uint32_t count = amd::alignUp(size, MemAlignment);
 
+  // Save previous event
+  pinGpuEvent();
+
   // Check if buffer has enough space for reservation
   if ((wrtOffset_ + count) > size_) {
     // Get the next buffer in the list
     ++activeBuffer_;
     activeBuffer_ %= MaxNumberOfBuffers;
     // Make sure the buffer isn't busy
-    buffers_[activeBuffer_]->wait(gpu_);
-    wrtAddress_ = buffers_[activeBuffer_]->data();
+    gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]);
+    gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]);
+    wrtAddress_ = pool_[activeBuffer_].buf->data();
     wrtOffset_ = 0;
   }
 
-  *gpu_address = buffers_[activeBuffer_]->vmAddress() + wrtOffset_;
+  *gpu_address = pool_[activeBuffer_].buf->vmAddress() + wrtOffset_;
   address cpu_address = wrtAddress_ + wrtOffset_;
 
   // Adjust the offset by the reserved size
@@ -80,23 +86,17 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
 }
 
 // ================================================================================================
-Memory& ManagedBuffer::reserveAtTheTop(uint32_t size)
-{
-  // Get the next buffer in the list
-  ++activeBuffer_;
-  activeBuffer_ %= MaxNumberOfBuffers;
-  // Make sure the buffer isn't busy
-  buffers_[activeBuffer_]->wait(gpu_);
-  wrtAddress_ = buffers_[activeBuffer_]->data();
-  wrtOffset_ = 0;
-  return *buffers_[activeBuffer_];
+void ManagedBuffer::pinGpuEvent() {
+  GpuEvent* event = activeMemory()->getGpuEvent(gpu());
+  pool_[activeBuffer_].events[event->engineId_] = *event;
+  activeMemory()->setBusy(gpu(), GpuEvent::InvalidID);
 }
 
 // ================================================================================================
 ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
-    : mbuf_(mbuf)
-    , sys_mem_copy_(nullptr)
-    , size_(size)
+  : mbuf_(mbuf)
+  , sys_mem_copy_(nullptr)
+  , size_(size)
 {}
 
 // ================================================================================================
@@ -127,11 +127,11 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
 
 // ================================================================================================
 uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
-    uint64_t  vm_address;
-    address   cpu_address = mbuf_.reserve(size, &vm_address);
-    // Update memory with new CB data
-    memcpy(cpu_address, sysmem, size);
-    return vm_address;
+  uint64_t  vm_address;
+  address   cpu_address = mbuf_.reserve(size, &vm_address);
+  // Update memory with new CB data
+  memcpy(cpu_address, sysmem, size);
+  return vm_address;
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
index da1984f636..c1853b0537 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
@@ -30,9 +30,6 @@ class ManagedBuffer : public amd::EmbeddedObject {
   address reserve(uint32_t size,  //!< real data size for upload
                   uint64_t* gpu_address);
 
-  //! Reserves memory at the top of the active buffer
-  Memory& reserveAtTheTop(uint32_t size);
-
   //! Returns CB size
   uint32_t size() const { return size_; }
 
@@ -40,14 +37,23 @@ class ManagedBuffer : public amd::EmbeddedObject {
   uint32_t wrtOffset() const { return wrtOffset_; }
 
   //! Returns active GPU buffer
-  Memory* activeMemory() const { return buffers_[activeBuffer_]; }
+  Memory* activeMemory() const { return pool_[activeBuffer_].buf; }
 
-  uint64_t vmAddress() const { return buffers_[activeBuffer_]->vmAddress(); }
+  //! Returns VM address for the active buffer
+  uint64_t vmAddress() const { return pool_[activeBuffer_].buf->vmAddress(); }
+
+  //! Update the timestamp for the HW operation
+  void pinGpuEvent();
 
   //! Returns VirtualGPU object this managed resource associated
   VirtualGPU& gpu() const { return gpu_; }
 
  private:
+  struct TimeStampedBuffer {
+    Memory*   buf;
+    GpuEvent  events[AllEngines];
+  };
+
   //! The maximum number of the managed buffers
   static constexpr uint32_t MaxNumberOfBuffers = 3;
 
@@ -58,7 +64,7 @@ class ManagedBuffer : public amd::EmbeddedObject {
   ManagedBuffer& operator=(const ManagedBuffer&) = delete;
 
   VirtualGPU& gpu_;                 //!< Virtual GPU object
-  std::vector<Memory*>  buffers_;   //!< Buffers for management
+  std::vector<TimeStampedBuffer>  pool_;   //!< Buffers for management
   uint32_t  activeBuffer_;          //!< Current active buffer
   uint32_t  size_;                  //!< Constant buffer size
   uint32_t  wrtOffset_;             //!< Current write offset
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
index 6825c8b8c5..fbb95bcc80 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -145,6 +145,7 @@ class Memory : public device::Memory, public Resource {
   //! Quick view update for managed buffers. It should avoid expensive object allocations
   void updateView(Resource* view, size_t offset, size_t size) {
     size_ = size;
+    flags_ |= HostMemoryDirectAccess;
     Resource::updateView(view, offset, size);
   }
 
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
index a838f2673a..3ae73d7ec6 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
@@ -378,6 +378,7 @@ class Resource : public amd::HeapObject {
       memRef_->retain();
       desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
         Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
+      setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
     }
   }