From a12776ba659522482daecf9b55da028be2e9a26c Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 14 Sep 2017 11:58:52 -0400
Subject: [PATCH] P4 to Git Change 1458879 by gandryey@gera-w8 on 2017/09/14
11:41:44
SWDEV-129129 - [CQE OCL][Vega vs Fiji] Up to 12% Performance drop observed on VEGA10 compared to FIJI while running BlackMagic Davinci Resolve
More benchmark tuning:
- Keep system memory locked in the resource cache. That removes a huge number of lock/unlock calls to the OS caused by resource creation and destruction
- Reduce the command buffer size to 256 commands and increase the number of CBs to 16
- Increase the amount of resident resources to 2048
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#574 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#58 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#31 edit
[ROCm/clr commit: 4066449a8b3594c67bedb7d40392be55c8ca826d]
---
.../rocclr/runtime/device/gpu/gpudevice.cpp | 6 ++++
.../rocclr/runtime/device/pal/palmemory.cpp | 3 +-
.../rocclr/runtime/device/pal/palresource.cpp | 32 ++++++++++++++++---
.../rocclr/runtime/device/pal/palvirtual.cpp | 2 +-
.../rocclr/runtime/device/pal/palvirtual.hpp | 4 +--
5 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
index 5baa2bca91..b6d3b77974 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -1337,6 +1337,12 @@ gpu::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
(owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER))
? Resource::Remote
: Resource::Local;
+
+ // Check if runtime can force a tiny buffer into USWC memory
+ if ((size <= (GPU_MAX_REMOTE_MEM_SIZE * Ki)) && (type == Resource::Local) &&
+ (owner.getMemFlags() & CL_MEM_READ_ONLY)) {
+ type = Resource::RemoteUSWC;
+ }
if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) {
type = Resource::BusAddressable;
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
index 3796aa2e9a..bdb7687f54 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
@@ -416,7 +416,8 @@ Memory::~Memory() {
if ((owner() != nullptr) && isHostMemDirectAccess() && !(flags_ & SubMemoryObject) &&
(memoryType() != Resource::ExternalPhysical)) {
// Unmap memory if direct access was requested
- unmap(nullptr);
+ // Note: runtime will perform unmap on the actual resource destruction
+ //unmap(nullptr);
}
}
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index f06b85268d..36a971ec85 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -744,6 +744,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
}
}
}
+ // Check if memory is locked already and restore CPU pointer
+ if (memRef_->cpuAddress_ != nullptr) {
+ address_ = memRef_->cpuAddress_;
+ memRef_->cpuAddress_ = nullptr;
+ mapCount_++;
+ }
Pal::BufferViewInfo viewInfo = {};
viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset();
viewInfo.range = memRef_->iMem()->Desc().size;
@@ -890,6 +896,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
LogWarning("Image is bigger than the original mem object!");
}
}
+ // Check if memory is locked already and restore CPU pointer
+ if (memRef_->cpuAddress_ != nullptr) {
+ address_ = memRef_->cpuAddress_;
+ memRef_->cpuAddress_ = nullptr;
+ mapCount_++;
+ }
result = image_->BindGpuMemory(memRef_->gpuMem_, offset_);
if (result != Pal::Result::Success) {
@@ -1070,6 +1082,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
return false;
}
}
+ // Check if memory is locked already and restore CPU pointer
+ if (memRef_->cpuAddress_ != nullptr) {
+ address_ = memRef_->cpuAddress_;
+ memRef_->cpuAddress_ = nullptr;
+ mapCount_++;
+ }
return true;
}
@@ -1079,7 +1097,7 @@ void Resource::free() {
}
// Sanity check for the map calls
- if (mapCount_ != 0) {
+ if ((mapCount_ != 0) && (memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
LogWarning("Resource wasn't unlocked, but destroyed!");
}
const bool wait =
@@ -1105,10 +1123,16 @@ void Resource::free() {
if (renames_.size() == 0) {
// Destroy GSL resource
if (iMem() != 0) {
- //! @note: This is a workaround for bad applications that
- //! don't unmap memory
if (mapCount_ != 0) {
- unmap(nullptr);
+ if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
+ //! @note: This is a workaround for bad applications that
+ //! don't unmap memory
+ unmap(nullptr);
+ } else {
+ // Delay CPU address unmap until memRef_ destruction
+ assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
+ memRef_->cpuAddress_ = address_;
+ }
}
// Add resource to the cache if it's not assigned to a specific queue
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 855574e0e2..95c9667e41 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -298,7 +298,7 @@ bool VirtualGPU::Queue::flush() {
palSdiRefs_.resize(0);
// Remove old memory references
- if ((memReferences_.size() > 1024) || (residency_size_ > residency_limit_)) {
+ if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
for (auto it = memReferences_.begin(); it != memReferences_.end();) {
if (it->second == cmdBufIdSlot_) {
palMems_.push_back(it->first->iMem());
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index b6e9c4997a..ef4d1644a7 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -41,8 +41,8 @@ class VirtualGPU : public device::VirtualDevice {
public:
class Queue : public amd::HeapObject {
public:
- static const uint MaxCmdBuffers = 8;
- static const uint MaxCommands = 512;
+ static const uint MaxCmdBuffers = 16;
+ static const uint MaxCommands = 256;
static const uint StartCmdBufIdx = 1;
static const uint FirstMemoryReference = 0x80000000;
static const uint64_t WaitTimeoutInNsec = 6000000000;