P4 to Git Change 1458879 by gandryey@gera-w8 on 2017/09/14 11:41:44

SWDEV-129129 - [CQE OCL][Vega vs Fiji] Up to 12% performance drop observed on VEGA10 compared to FIJI while running BlackMagic DaVinci Resolve
	More benchmark tuning:
	- Keep system memory locked in the resource cache. That removes a huge number of lock/unlock calls to the OS caused by resource creation and destruction
	- Reduce the command buffer size to 256 commands and increase the number of CBs to 16
	- Increase the number of resident resources to 2048

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#574 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#58 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#31 edit


[ROCm/clr commit: 4066449a8b]
Этот коммит содержится в:
foreman
2017-09-14 11:58:52 -04:00
родитель 80971280eb
Коммит a12776ba65
5 изменённых файлов: 39 добавлений и 8 удалений
+6
Просмотреть файл
@@ -1337,6 +1337,12 @@ gpu::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
(owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER))
? Resource::Remote
: Resource::Local;
// Check if runtime can force a tiny buffer into USWC memory
if ((size <= (GPU_MAX_REMOTE_MEM_SIZE * Ki)) && (type == Resource::Local) &&
(owner.getMemFlags() & CL_MEM_READ_ONLY)) {
type = Resource::RemoteUSWC;
}
if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) {
type = Resource::BusAddressable;
+2 -1
Просмотреть файл
@@ -416,7 +416,8 @@ Memory::~Memory() {
if ((owner() != nullptr) && isHostMemDirectAccess() && !(flags_ & SubMemoryObject) &&
(memoryType() != Resource::ExternalPhysical)) {
// Unmap memory if direct access was requested
unmap(nullptr);
// Note: runtime will perform unmap on the actual resource destruction
//unmap(nullptr);
}
}
+28 -4
Просмотреть файл
@@ -744,6 +744,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
}
}
}
// Check if memory is locked already and restore CPU pointer
if (memRef_->cpuAddress_ != nullptr) {
address_ = memRef_->cpuAddress_;
memRef_->cpuAddress_ = nullptr;
mapCount_++;
}
Pal::BufferViewInfo viewInfo = {};
viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset();
viewInfo.range = memRef_->iMem()->Desc().size;
@@ -890,6 +896,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
LogWarning("Image is bigger than the original mem object!");
}
}
// Check if memory is locked already and restore CPU pointer
if (memRef_->cpuAddress_ != nullptr) {
address_ = memRef_->cpuAddress_;
memRef_->cpuAddress_ = nullptr;
mapCount_++;
}
result = image_->BindGpuMemory(memRef_->gpuMem_, offset_);
if (result != Pal::Result::Success) {
@@ -1070,6 +1082,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
return false;
}
}
// Check if memory is locked already and restore CPU pointer
if (memRef_->cpuAddress_ != nullptr) {
address_ = memRef_->cpuAddress_;
memRef_->cpuAddress_ = nullptr;
mapCount_++;
}
return true;
}
@@ -1079,7 +1097,7 @@ void Resource::free() {
}
// Sanity check for the map calls
if (mapCount_ != 0) {
if ((mapCount_ != 0) && (memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
LogWarning("Resource wasn't unlocked, but destroyed!");
}
const bool wait =
@@ -1105,10 +1123,16 @@ void Resource::free() {
if (renames_.size() == 0) {
// Destroy GSL resource
if (iMem() != 0) {
//! @note: This is a workaround for bad applications that
//! don't unmap memory
if (mapCount_ != 0) {
unmap(nullptr);
if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
//! @note: This is a workaround for bad applications that
//! don't unmap memory
unmap(nullptr);
} else {
// Delay CPU address unmap until memRef_ destruction
assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
memRef_->cpuAddress_ = address_;
}
}
// Add resource to the cache if it's not assigned to a specific queue
+1 -1
Просмотреть файл
@@ -298,7 +298,7 @@ bool VirtualGPU::Queue::flush() {
palSdiRefs_.resize(0);
// Remove old memory references
if ((memReferences_.size() > 1024) || (residency_size_ > residency_limit_)) {
if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
for (auto it = memReferences_.begin(); it != memReferences_.end();) {
if (it->second == cmdBufIdSlot_) {
palMems_.push_back(it->first->iMem());
+2 -2
Просмотреть файл
@@ -41,8 +41,8 @@ class VirtualGPU : public device::VirtualDevice {
public:
class Queue : public amd::HeapObject {
public:
static const uint MaxCmdBuffers = 8;
static const uint MaxCommands = 512;
static const uint MaxCmdBuffers = 16;
static const uint MaxCommands = 256;
static const uint StartCmdBufIdx = 1;
static const uint FirstMemoryReference = 0x80000000;
static const uint64_t WaitTimeoutInNsec = 6000000000;