From caad6f5ccec769d6ea279a8daeff97a8189f329e Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 24 Aug 2017 17:45:19 -0400
Subject: [PATCH] P4 to Git Change 1451494 by gandryey@gera-w8 on 2017/08/24
17:34:58
SWDEV-129129 - [CQE OCL][Vega vs Fiji] Up to 12% performance drop observed on VEGA10 compared to FIJI while running Blackmagic DaVinci Resolve
- Remove the debug logic (resident_ field and its residency checks) from CL#1451293, because it caused some CPU overhead.
- Turn waifForFence() into a template on ibReuse so that calls which are unnecessary for a given mode (e.g. amd::Os::timeNanos()) are skipped.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#29 edit
---
rocclr/runtime/device/pal/palmemory.cpp | 1 -
rocclr/runtime/device/pal/palresource.cpp | 11 ++++++-----
rocclr/runtime/device/pal/palresource.hpp | 1 -
rocclr/runtime/device/pal/palvirtual.cpp | 23 +++++++----------------
rocclr/runtime/device/pal/palvirtual.hpp | 15 +++++++++++----
5 files changed, 24 insertions(+), 27 deletions(-)
diff --git a/rocclr/runtime/device/pal/palmemory.cpp b/rocclr/runtime/device/pal/palmemory.cpp
index f417e072fc..6522677163 100644
--- a/rocclr/runtime/device/pal/palmemory.cpp
+++ b/rocclr/runtime/device/pal/palmemory.cpp
@@ -170,7 +170,6 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
}
if (result) {
- dev().addResource(memRef());
if (params != nullptr) {
memRef()->gpu_ = params->gpu_;
}
diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp
index 9c75bd3553..9448a1825e 100644
--- a/rocclr/runtime/device/pal/palresource.cpp
+++ b/rocclr/runtime/device/pal/palresource.cpp
@@ -46,6 +46,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
}
// Update free memory size counters
const_cast(dev).updateFreeMemory(createInfo.heaps[0], createInfo.size, false);
+ dev.addResource(memRef);
return memRef;
}
@@ -68,6 +69,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
}
// Update free memory size counters
const_cast(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false);
+ dev.addResource(memRef);
return memRef;
}
@@ -90,6 +92,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
// Update free memory size counters
const_cast(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size,
false);
+ dev.addResource(memRef);
return memRef;
}
@@ -111,6 +114,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
return nullptr;
}
}
+ dev.addResource(memRef);
return memRef;
}
@@ -137,11 +141,12 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
return nullptr;
}
}
+ dev.addResource(memRef);
return memRef;
}
GpuMemoryReference::GpuMemoryReference(const Device& dev)
- : gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr), resident_(0) {}
+ : gpuMem_(nullptr), cpuAddress_(nullptr), events_(dev.numOfVgpus()), device_(dev), gpu_(nullptr) {}
GpuMemoryReference::~GpuMemoryReference() {
if (gpu_ == nullptr) {
@@ -162,10 +167,6 @@ GpuMemoryReference::~GpuMemoryReference() {
device_.vgpus()[0]->releaseMemory(this, &events_[0]);
}
- if (resident_ != 0) {
- LogError("Residency counter isn't 0 on memory destroy!");
- }
-
{
amd::ScopedLock lk(device_.lockPAL());
if (cpuAddress_ != nullptr) {
diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp
index 36156683af..e7f91440fd 100644
--- a/rocclr/runtime/device/pal/palresource.hpp
+++ b/rocclr/runtime/device/pal/palresource.hpp
@@ -51,7 +51,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
//! @note: This field is necessary for the thread safe release only
VirtualGPU* gpu_; //!< Resource will be used only on this queue
std::vector events_; //!< GPU events associated with the resource
- std::atomic resident_; //!< Atomic counter for residency
protected:
//! Default destructor
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index aea220fe4e..29d2bd0c63 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -163,7 +163,6 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
palSdiRefs_.push_back(iMem);
}
residency_size_ += iMem->Desc().size;
- mem->resident_++;
}
}
@@ -172,7 +171,6 @@ void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
if (0 != memReferences_.erase(mem)) {
iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
residency_size_ -= iMem->Desc().size;
- mem->resident_--;
}
}
@@ -222,13 +220,6 @@ bool VirtualGPU::Queue::flush() {
return false;
}
- // Validate resources
- for (auto it : memReferences_) {
- if (it.second == cmdBufIdSlot_) {
- assert(it.first->resident_ > 0 && "Unresident resource!");
- }
- }
-
if (palMemRefs_.size() != 0) {
if (Pal::Result::Success !=
iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
@@ -259,8 +250,10 @@ bool VirtualGPU::Queue::flush() {
LogError("PAL failed to submit CMD!");
return false;
}
+ // Make sure the slot isn't busy
+ constexpr bool IbReuse = true;
if (GPU_FLUSH_ON_EXECUTION) {
- waifForFence(cmdBufIdSlot_);
+ waifForFence(cmdBufIdSlot_);
}
// Reset the counter of commands
@@ -271,7 +264,7 @@ bool VirtualGPU::Queue::flush() {
if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
// Wait for the last one
- waifForFence(cmdBufIdSlot_);
+ waifForFence(cmdBufIdSlot_);
cmdBufIdCurrent_ = 1;
cmbBufIdRetired_ = 0;
}
@@ -279,9 +272,7 @@ bool VirtualGPU::Queue::flush() {
// Wrap current slot
cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
- // Make sure the slot isn't busy
- constexpr bool IbReuse = true;
- waifForFence(cmdBufIdSlot_, IbReuse);
+ waifForFence(cmdBufIdSlot_);
// Progress retired TS
if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
@@ -312,7 +303,6 @@ bool VirtualGPU::Queue::flush() {
if (it->second == cmdBufIdSlot_) {
palMems_.push_back(it->first->iMem());
residency_size_ -= it->first->iMem()->Desc().size;
- it->first->resident_--;
it = memReferences_.erase(it);
} else {
++it;
@@ -333,7 +323,8 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
}
uint slotId = id % MaxCmdBuffers;
- bool result = waifForFence(slotId);
+ constexpr bool IbReuse = true;
+ bool result = waifForFence(slotId);
cmbBufIdRetired_ = id;
return result;
}
diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp
index 19f93202da..15c375a23b 100644
--- a/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/rocclr/runtime/device/pal/palvirtual.hpp
@@ -96,15 +96,22 @@ class VirtualGPU : public device::VirtualDevice {
}
// ibReuse forces event wait without polling, to make sure event occured
- bool waifForFence(uint cbId, bool ibReuse = false) const {
+ template
+ bool waifForFence(uint cbId) const {
Pal::Result result = Pal::Result::Success;
- uint64_t start = amd::Os::timeNanos();
- while ((Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) || ibReuse) {
+ uint64_t start;
+ uint64_t end;
+ if (!ibReuse) {
+ start = amd::Os::timeNanos();
+ }
+ while (ibReuse || (Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus()))) {
if (result == Pal::Result::ErrorFenceNeverSubmitted) {
result = Pal::Result::Success;
break;
}
- uint64_t end = amd::Os::timeNanos();
+ if (!ibReuse) {
+ end = amd::Os::timeNanos();
+ }
if (!ibReuse && ((end - start) < PollIntervalInNsec)) {
amd::Os::yield();
continue;