P4 to Git Change 1458879 by gandryey@gera-w8 on 2017/09/14 11:41:44
SWDEV-129129 - [CQE OCL][Vega vs Fiji] Up to 12% performance drop observed on VEGA10 compared to FIJI while running BlackMagic Davinci Resolve
More benchmark tuning:
- Keep system memory locked in the resource cache. That removes a huge amount of lock/unlock calls to the OS due to resource creation and destruction
- Reduce the command buffer size to 256 commands and increase the number of CBs to 16
- Increase the amount of resident resources to 2048
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#574 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#37 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#58 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#31 edit
[ROCm/clr commit: 4066449a8b]
Этот коммит содержится в:
@@ -1337,6 +1337,12 @@ gpu::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
|
||||
(owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER))
|
||||
? Resource::Remote
|
||||
: Resource::Local;
|
||||
|
||||
// Check if runtime can force a tiny buffer into USWC memory
|
||||
if ((size <= (GPU_MAX_REMOTE_MEM_SIZE * Ki)) && (type == Resource::Local) &&
|
||||
(owner.getMemFlags() & CL_MEM_READ_ONLY)) {
|
||||
type = Resource::RemoteUSWC;
|
||||
}
|
||||
|
||||
if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) {
|
||||
type = Resource::BusAddressable;
|
||||
|
||||
@@ -416,7 +416,8 @@ Memory::~Memory() {
|
||||
if ((owner() != nullptr) && isHostMemDirectAccess() && !(flags_ & SubMemoryObject) &&
|
||||
(memoryType() != Resource::ExternalPhysical)) {
|
||||
// Unmap memory if direct access was requested
|
||||
unmap(nullptr);
|
||||
// Note: runtime will perform unmap on the actual resource destruction
|
||||
//unmap(nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -744,6 +744,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check if memory is locked already and restore CPU pointer
|
||||
if (memRef_->cpuAddress_ != nullptr) {
|
||||
address_ = memRef_->cpuAddress_;
|
||||
memRef_->cpuAddress_ = nullptr;
|
||||
mapCount_++;
|
||||
}
|
||||
Pal::BufferViewInfo viewInfo = {};
|
||||
viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset();
|
||||
viewInfo.range = memRef_->iMem()->Desc().size;
|
||||
@@ -890,6 +896,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
|
||||
LogWarning("Image is bigger than the original mem object!");
|
||||
}
|
||||
}
|
||||
// Check if memory is locked already and restore CPU pointer
|
||||
if (memRef_->cpuAddress_ != nullptr) {
|
||||
address_ = memRef_->cpuAddress_;
|
||||
memRef_->cpuAddress_ = nullptr;
|
||||
mapCount_++;
|
||||
}
|
||||
|
||||
result = image_->BindGpuMemory(memRef_->gpuMem_, offset_);
|
||||
if (result != Pal::Result::Success) {
|
||||
@@ -1070,6 +1082,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Check if memory is locked already and restore CPU pointer
|
||||
if (memRef_->cpuAddress_ != nullptr) {
|
||||
address_ = memRef_->cpuAddress_;
|
||||
memRef_->cpuAddress_ = nullptr;
|
||||
mapCount_++;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1079,7 +1097,7 @@ void Resource::free() {
|
||||
}
|
||||
|
||||
// Sanity check for the map calls
|
||||
if (mapCount_ != 0) {
|
||||
if ((mapCount_ != 0) && (memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
|
||||
LogWarning("Resource wasn't unlocked, but destroyed!");
|
||||
}
|
||||
const bool wait =
|
||||
@@ -1105,10 +1123,16 @@ void Resource::free() {
|
||||
if (renames_.size() == 0) {
|
||||
// Destroy GSL resource
|
||||
if (iMem() != 0) {
|
||||
//! @note: This is a workaround for bad applications that
|
||||
//! don't unmap memory
|
||||
if (mapCount_ != 0) {
|
||||
unmap(nullptr);
|
||||
if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
|
||||
//! @note: This is a workaround for bad applications that
|
||||
//! don't unmap memory
|
||||
unmap(nullptr);
|
||||
} else {
|
||||
// Delay CPU address unmap until memRef_ destruction
|
||||
assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
|
||||
memRef_->cpuAddress_ = address_;
|
||||
}
|
||||
}
|
||||
|
||||
// Add resource to the cache if it's not assigned to a specific queue
|
||||
|
||||
@@ -298,7 +298,7 @@ bool VirtualGPU::Queue::flush() {
|
||||
palSdiRefs_.resize(0);
|
||||
|
||||
// Remove old memory references
|
||||
if ((memReferences_.size() > 1024) || (residency_size_ > residency_limit_)) {
|
||||
if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
|
||||
for (auto it = memReferences_.begin(); it != memReferences_.end();) {
|
||||
if (it->second == cmdBufIdSlot_) {
|
||||
palMems_.push_back(it->first->iMem());
|
||||
|
||||
@@ -41,8 +41,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
public:
|
||||
class Queue : public amd::HeapObject {
|
||||
public:
|
||||
static const uint MaxCmdBuffers = 8;
|
||||
static const uint MaxCommands = 512;
|
||||
static const uint MaxCmdBuffers = 16;
|
||||
static const uint MaxCommands = 256;
|
||||
static const uint StartCmdBufIdx = 1;
|
||||
static const uint FirstMemoryReference = 0x80000000;
|
||||
static const uint64_t WaitTimeoutInNsec = 6000000000;
|
||||
|
||||
Ссылка в новой задаче
Block a user