From 51a4d1865c394bc02d00a3ebb203a0e5b294cae9 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 29 May 2018 13:18:29 -0400 Subject: [PATCH] P4 to Git Change 1561015 by gandryey@gera-w8 on 2018/05/29 13:10:06 SWDEV-79445 - OCL generic changes and code clean-up Optimize memory dependency tracking logic: 1. Add modified_ field to the event object to track memory writes into device memory. 2. Check memory dependency only if a write operation is currently requested or previously performed. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#35 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#66 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#100 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#52 edit --- rocclr/runtime/device/pal/paldefs.hpp | 19 ++++++---- rocclr/runtime/device/pal/palresource.cpp | 25 ++++++++++++ rocclr/runtime/device/pal/palresource.hpp | 6 +++ rocclr/runtime/device/pal/palvirtual.cpp | 46 ++++++++++++----------- rocclr/runtime/device/pal/palvirtual.hpp | 11 +++--- 5 files changed, 73 insertions(+), 34 deletions(-) diff --git a/rocclr/runtime/device/pal/paldefs.hpp b/rocclr/runtime/device/pal/paldefs.hpp index d98f4d5000..d0018afd9e 100644 --- a/rocclr/runtime/device/pal/paldefs.hpp +++ b/rocclr/runtime/device/pal/paldefs.hpp @@ -44,22 +44,27 @@ struct HwDbgKernelInfo { enum EngineType { MainEngine = 0, SdmaEngine, AllEngines }; struct GpuEvent { - static const unsigned int InvalidID = ((1 << 30) - 1); + static constexpr uint32_t InvalidID = ((1 << 30) - 1); struct { - uint32_t id : 31; ///< actual event id - uint32_t engineId_ : 1; ///< type of the id + uint32_t id_ : 30; ///< Actual event id + uint32_t modified_ : 1; ///< Resource associated with the event was modified + uint32_t engineId_ : 1; ///< Type of the id }; //! 
GPU event default constructor - GpuEvent() : id(InvalidID), engineId_(MainEngine) {} + GpuEvent() : id_(InvalidID), engineId_(MainEngine), modified_(false) {} //! GPU event constructor - GpuEvent(uint evt) : id(evt), engineId_(MainEngine) {} + GpuEvent(uint evt) : id_(evt), engineId_(MainEngine), modified_(false) {} //! Returns true if the current event is valid - bool isValid() const { return (id != InvalidID) ? true : false; } + bool isValid() const { return (id_ != InvalidID) ? true : false; } //! Set invalid event id - void invalidate() { id = InvalidID; } + void invalidate() { id_ = InvalidID; } + + // Overwrite default assign operator to preserve modified_ field + GpuEvent& operator=(const GpuEvent& evt) + { id_ = evt.id_; engineId_ = evt.engineId_; return *this; } }; /*! \addtogroup PAL diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp index cb16e3a695..42c03d25cf 100644 --- a/rocclr/runtime/device/pal/palresource.cpp +++ b/rocclr/runtime/device/pal/palresource.cpp @@ -1665,6 +1665,31 @@ GpuEvent* Resource::getGpuEvent(const VirtualGPU& gpu) const { return &events_[idx]; } +// ================================================================================================ +void Resource::setModified(VirtualGPU& gpu, bool modified) const { + uint idx = gpu.index(); + assert(idx < events_.size()); + events_[idx].modified_ = modified; + + // If current resource is a view, then update the parent as well + if (viewOwner_ != nullptr) { + viewOwner_->setModified(gpu, modified); + } +} + +// ================================================================================================ +bool Resource::isModified(VirtualGPU& gpu) const { + uint idx = gpu.index(); + assert(idx < events_.size()); + bool modified = events_[idx].modified_; + + // If current resource is a view, then get the parent state as well + if (viewOwner_ != nullptr) { + modified |= viewOwner_->isModified(gpu); + } + return modified; +} + // 
================================================================================================ void Resource::palFree() const { if (desc().type_ == OGLInterop) { diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp index f25ed34e99..96f55c6f10 100644 --- a/rocclr/runtime/device/pal/palresource.hpp +++ b/rocclr/runtime/device/pal/palresource.hpp @@ -382,6 +382,12 @@ class Resource : public amd::HeapObject { } } + //! Update the modified field of the event, meaning the resource was updated + void setModified(VirtualGPU& gpu, bool modified) const; + + //! Returns true if the modified field of the event is set for this resource or for its view owner + bool isModified(VirtualGPU& gpu) const; + protected: /*! \brief Creates a PAL iamge object, associated with the resource * diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index 683d5f6fa6..126310a7ce 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -427,24 +427,26 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor uint64_t curStart = memory->vmAddress(); uint64_t curEnd = curStart + memory->size(); - // Loop through all memory objects in the queue and find dependency - // @note don't include objects from the current kernel - for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { - // Check if the queue already contains this mem object and - // GPU operations aren't readonly - uint64_t busyStart = memObjectsInQueue_[j].start_; - uint64_t busyEnd 
= memObjectsInQueue_[j].end_; - // Check if the start inside the busy region - if ((((curStart >= busyStart) && (curStart < busyEnd)) || - // Check if the end inside the busy region - ((curEnd > busyStart) && (curEnd <= busyEnd)) || - // Check if the start/end cover the busy region - ((curStart <= busyStart) && (curEnd >= busyEnd))) && - // If the buys region was written or the current one is for write - (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { - flushL1Cache = true; - break; + // Check if the start inside the busy region + if ((((curStart >= busyStart) && (curStart < busyEnd)) || + // Check if the end inside the busy region + ((curEnd > busyStart) && (curEnd <= busyEnd)) || + // Check if the start/end cover the busy region + ((curStart <= busyStart) && (curEnd >= busyEnd))) && + // If the buys region was written or the current one is for write + (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { + flushL1Cache = true; + break; + } } } @@ -471,6 +473,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd; memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly; numMemObjectsInQueue_++; + // Mark resource as modified + memory->setModified(gpu, !readOnly); } void VirtualGPU::MemoryDependency::clear(bool all) { @@ -1955,7 +1959,7 @@ void VirtualGPU::PostDeviceEnqueue( uint64_t vmParentWrap, GpuEvent* gpuEvent) { - uint32_t id = gpuEvent->id; + uint32_t id = gpuEvent->id_; amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); // Make sure exculsive access to the device queue @@ -2036,7 +2040,7 @@ void VirtualGPU::PostDeviceEnqueue( iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr, dev().settings().useDeviceQueue_); - if (id != gpuEvent->id) { + if (id != gpuEvent->id_) { LogError("Something is wrong. 
ID mismatch!\n"); } eventEnd(MainEngine, *gpuEvent); @@ -2133,7 +2137,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const for (int iter = 0; iter < iteration; ++iter) { GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId()); - uint32_t id = gpuEvent.id; + uint32_t id = gpuEvent.id_; // Reset global size for dimension dim if split is needed if (dim != -1) { newOffset[dim] = sizes.offset()[dim] + globalStep * iter; @@ -2184,7 +2188,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const if (profiling() || state_.profileEnabled_) { addBarrier(); } - if (id != gpuEvent.id) { + if (id != gpuEvent.id_) { LogError("Something is wrong. ID mismatch!\n"); } eventEnd(MainEngine, gpuEvent); diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp index 9ca48d151a..fa48024c9d 100644 --- a/rocclr/runtime/device/pal/palvirtual.hpp +++ b/rocclr/runtime/device/pal/palvirtual.hpp @@ -463,11 +463,11 @@ class VirtualGPU : public device::VirtualDevice { constexpr bool End = false; if (forceExec) { constexpr bool ForceFlush = true; - event.id = queues_[engId]->submit(ForceFlush); + event.id_ = queues_[engId]->submit(ForceFlush); profileEvent(engId, End); } else { profileEvent(engId, End); - event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION); + event.id_ = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION); } event.engineId_ = engId; } @@ -475,7 +475,7 @@ class VirtualGPU : public device::VirtualDevice { void waitForEvent(GpuEvent* event) const { if (event->isValid()) { assert(event->engineId_ < AllEngines); - queues_[event->engineId_]->waitForEvent(event->id); + queues_[event->engineId_]->waitForEvent(event->id_); event->invalidate(); } } @@ -483,7 +483,7 @@ class VirtualGPU : public device::VirtualDevice { bool isDone(GpuEvent* event) { if (event->isValid()) { assert(event->engineId_ < AllEngines); - if (queues_[event->engineId_]->isDone(event->id)) { + if 
(queues_[event->engineId_]->isDone(event->id_)) { event->invalidate(); return true; } @@ -623,9 +623,8 @@ class VirtualGPU : public device::VirtualDevice { }; inline void VirtualGPU::addVmMemory(const Memory* memory) { - GpuEvent event(queues_[MainEngine]->cmdBufId()); queues_[MainEngine]->addCmdMemRef(memory->memRef()); - memory->setBusy(*this, event); + memory->setBusy(*this, queues_[MainEngine]->cmdBufId()); } inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {