From 51a4d1865c394bc02d00a3ebb203a0e5b294cae9 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 29 May 2018 13:18:29 -0400
Subject: [PATCH] P4 to Git Change 1561015 by gandryey@gera-w8 on 2018/05/29
13:10:06
SWDEV-79445 - OCL generic changes and code clean-up
Optimize memory dependency tracking logic:
1. Add modified_ field to the event object to track memory writes into device memory.
2. Check memory dependency only if a write operation is currently requested or previously performed.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#35 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#66 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#100 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#52 edit
---
rocclr/runtime/device/pal/paldefs.hpp | 19 ++++++----
rocclr/runtime/device/pal/palresource.cpp | 25 ++++++++++++
rocclr/runtime/device/pal/palresource.hpp | 6 +++
rocclr/runtime/device/pal/palvirtual.cpp | 46 ++++++++++++-----------
rocclr/runtime/device/pal/palvirtual.hpp | 11 +++---
5 files changed, 73 insertions(+), 34 deletions(-)
diff --git a/rocclr/runtime/device/pal/paldefs.hpp b/rocclr/runtime/device/pal/paldefs.hpp
index d98f4d5000..d0018afd9e 100644
--- a/rocclr/runtime/device/pal/paldefs.hpp
+++ b/rocclr/runtime/device/pal/paldefs.hpp
@@ -44,22 +44,27 @@ struct HwDbgKernelInfo {
enum EngineType { MainEngine = 0, SdmaEngine, AllEngines };
struct GpuEvent {
- static const unsigned int InvalidID = ((1 << 30) - 1);
+ static constexpr uint32_t InvalidID = ((1 << 30) - 1);
struct {
- uint32_t id : 31; ///< actual event id
- uint32_t engineId_ : 1; ///< type of the id
+ uint32_t id_ : 30; ///< Actual event id
+ uint32_t modified_ : 1; ///< Resource associated with the event was modified
+ uint32_t engineId_ : 1; ///< Type of the id
};
//! GPU event default constructor
- GpuEvent() : id(InvalidID), engineId_(MainEngine) {}
+ GpuEvent() : id_(InvalidID), engineId_(MainEngine), modified_(false) {}
//! GPU event constructor
- GpuEvent(uint evt) : id(evt), engineId_(MainEngine) {}
+ GpuEvent(uint evt) : id_(evt), engineId_(MainEngine), modified_(false) {}
//! Returns true if the current event is valid
- bool isValid() const { return (id != InvalidID) ? true : false; }
+ bool isValid() const { return (id_ != InvalidID) ? true : false; }
//! Set invalid event id
- void invalidate() { id = InvalidID; }
+ void invalidate() { id_ = InvalidID; }
+
+ // Overwrite default assign operator to preserve modified_ field
+ GpuEvent& operator=(const GpuEvent& evt)
+ { id_ = evt.id_; engineId_ = evt.engineId_; return *this; }
};
/*! \addtogroup PAL
diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp
index cb16e3a695..42c03d25cf 100644
--- a/rocclr/runtime/device/pal/palresource.cpp
+++ b/rocclr/runtime/device/pal/palresource.cpp
@@ -1665,6 +1665,31 @@ GpuEvent* Resource::getGpuEvent(const VirtualGPU& gpu) const {
return &events_[idx];
}
+// ================================================================================================
+void Resource::setModified(VirtualGPU& gpu, bool modified) const {
+ uint idx = gpu.index();
+ assert(idx < events_.size());
+ events_[idx].modified_ = modified;
+
+ // If current resource is a view, then update the parent as well
+ if (viewOwner_ != nullptr) {
+ viewOwner_->setModified(gpu, modified);
+ }
+}
+
+// ================================================================================================
+bool Resource::isModified(VirtualGPU& gpu) const {
+ uint idx = gpu.index();
+ assert(idx < events_.size());
+ bool modified = events_[idx].modified_;
+
+ // If current resource is a view, then get the parent state as well
+ if (viewOwner_ != nullptr) {
+ modified |= viewOwner_->isModified(gpu);
+ }
+ return modified;
+}
+
// ================================================================================================
void Resource::palFree() const {
if (desc().type_ == OGLInterop) {
diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp
index f25ed34e99..96f55c6f10 100644
--- a/rocclr/runtime/device/pal/palresource.hpp
+++ b/rocclr/runtime/device/pal/palresource.hpp
@@ -382,6 +382,12 @@ class Resource : public amd::HeapObject {
}
}
+ //! Update the modified field of the event, meaning the resource was updated
+ void setModified(VirtualGPU& gpu, bool modified) const;
+
+ //! Returns true if the resource, or the owner of this view, is marked as modified for the given GPU
+ bool isModified(VirtualGPU& gpu) const;
+
protected:
/*! \brief Creates a PAL iamge object, associated with the resource
*
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 683d5f6fa6..126310a7ce 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -427,24 +427,26 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
uint64_t curStart = memory->vmAddress();
uint64_t curEnd = curStart + memory->size();
- // Loop through all memory objects in the queue and find dependency
- // @note don't include objects from the current kernel
- for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
- // Check if the queue already contains this mem object and
- // GPU operations aren't readonly
- uint64_t busyStart = memObjectsInQueue_[j].start_;
- uint64_t busyEnd = memObjectsInQueue_[j].end_;
+ if (memory->isModified(gpu) || !readOnly) {
+ // Loop through all memory objects in the queue and find dependency
+ // @note don't include objects from the current kernel
+ for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
+ // Check if the queue already contains this mem object and
+ // GPU operations aren't readonly
+ uint64_t busyStart = memObjectsInQueue_[j].start_;
+ uint64_t busyEnd = memObjectsInQueue_[j].end_;
- // Check if the start inside the busy region
- if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
- // Check if the end inside the busy region
- ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
- // Check if the start/end cover the busy region
- ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
- // If the buys region was written or the current one is for write
- (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
- flushL1Cache = true;
- break;
+ // Check if the start inside the busy region
+ if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
+ // Check if the end inside the busy region
+ ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
+ // Check if the start/end cover the busy region
+ ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
+ // If the busy region was written or the current one is for write
+ (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
+ flushL1Cache = true;
+ break;
+ }
}
}
@@ -471,6 +473,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd;
memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly;
numMemObjectsInQueue_++;
+ // Mark resource as modified
+ memory->setModified(gpu, !readOnly);
}
void VirtualGPU::MemoryDependency::clear(bool all) {
@@ -1955,7 +1959,7 @@ void VirtualGPU::PostDeviceEnqueue(
uint64_t vmParentWrap,
GpuEvent* gpuEvent)
{
- uint32_t id = gpuEvent->id;
+ uint32_t id = gpuEvent->id_;
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
// Make sure exculsive access to the device queue
@@ -2036,7 +2040,7 @@ void VirtualGPU::PostDeviceEnqueue(
iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
signalAddr, dev().settings().useDeviceQueue_);
- if (id != gpuEvent->id) {
+ if (id != gpuEvent->id_) {
LogError("Something is wrong. ID mismatch!\n");
}
eventEnd(MainEngine, *gpuEvent);
@@ -2133,7 +2137,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
for (int iter = 0; iter < iteration; ++iter) {
GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
- uint32_t id = gpuEvent.id;
+ uint32_t id = gpuEvent.id_;
// Reset global size for dimension dim if split is needed
if (dim != -1) {
newOffset[dim] = sizes.offset()[dim] + globalStep * iter;
@@ -2184,7 +2188,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
if (profiling() || state_.profileEnabled_) {
addBarrier();
}
- if (id != gpuEvent.id) {
+ if (id != gpuEvent.id_) {
LogError("Something is wrong. ID mismatch!\n");
}
eventEnd(MainEngine, gpuEvent);
diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp
index 9ca48d151a..fa48024c9d 100644
--- a/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/rocclr/runtime/device/pal/palvirtual.hpp
@@ -463,11 +463,11 @@ class VirtualGPU : public device::VirtualDevice {
constexpr bool End = false;
if (forceExec) {
constexpr bool ForceFlush = true;
- event.id = queues_[engId]->submit(ForceFlush);
+ event.id_ = queues_[engId]->submit(ForceFlush);
profileEvent(engId, End);
} else {
profileEvent(engId, End);
- event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
+ event.id_ = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
}
event.engineId_ = engId;
}
@@ -475,7 +475,7 @@ class VirtualGPU : public device::VirtualDevice {
void waitForEvent(GpuEvent* event) const {
if (event->isValid()) {
assert(event->engineId_ < AllEngines);
- queues_[event->engineId_]->waitForEvent(event->id);
+ queues_[event->engineId_]->waitForEvent(event->id_);
event->invalidate();
}
}
@@ -483,7 +483,7 @@ class VirtualGPU : public device::VirtualDevice {
bool isDone(GpuEvent* event) {
if (event->isValid()) {
assert(event->engineId_ < AllEngines);
- if (queues_[event->engineId_]->isDone(event->id)) {
+ if (queues_[event->engineId_]->isDone(event->id_)) {
event->invalidate();
return true;
}
@@ -623,9 +623,8 @@ class VirtualGPU : public device::VirtualDevice {
};
inline void VirtualGPU::addVmMemory(const Memory* memory) {
- GpuEvent event(queues_[MainEngine]->cmdBufId());
queues_[MainEngine]->addCmdMemRef(memory->memRef());
- memory->setBusy(*this, event);
+ memory->setBusy(*this, queues_[MainEngine]->cmdBufId());
}
inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {