From 3e603d986aaf438f00a607b185d2b7932983d7c5 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 20 Jan 2023 15:34:24 -0800 Subject: [PATCH] SWDEV-364604 - Add ROCclr support for hipEventDisableSystemFence Change-Id: I6127b432a8759359359a1890fda85bc401be6a56 --- rocclr/device/device.hpp | 3 +++ rocclr/device/pal/palvirtual.hpp | 2 ++ rocclr/device/rocm/rocvirtual.cpp | 42 ++++++++++++++++++------------- rocclr/device/rocm/rocvirtual.hpp | 1 + rocclr/platform/command.cpp | 1 - rocclr/platform/commandqueue.cpp | 2 +- 6 files changed, 31 insertions(+), 20 deletions(-) diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index cf5bdb33f0..3e65a3311a 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1260,6 +1260,9 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; + //! Resets fence state of the VirtualGPU + virtual void resetFenceDirty() = 0; + private: //! Disable default copy constructor VirtualDevice& operator=(const VirtualDevice&); diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp index 8624064975..74fb039fae 100644 --- a/rocclr/device/pal/palvirtual.hpp +++ b/rocclr/device/pal/palvirtual.hpp @@ -359,6 +359,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } + void resetFenceDirty() {} + //! 
Returns GPU device object associated with this kernel const Device& dev() const { return gpuDevice_; } diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index feeef1381c..f2850b557c 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -78,13 +78,8 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierPacketAgentScopeHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - static constexpr uint16_t kNopPacketHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); @@ -99,14 +94,16 @@ static constexpr uint16_t kBarrierPacketReleaseHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr uint16_t kBarrierVendorPacketHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierVendorPacketAgentScopeHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t 
kBarrierVendorPacketNopScopeHeader = + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr hsa_barrier_and_packet_t kBarrierAcquirePacket = { kBarrierPacketAcquireHeader, 0, 0, {{0}}, 0, {0}}; @@ -989,6 +986,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + fence_dirty_ = true; auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); if (!skipSignal) { @@ -1001,7 +999,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, } // Reset fence_dirty_ flag if we submit a barrier - fence_dirty_ = false; + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_barrier_and_packet_t* aql_loc = @@ -1063,6 +1063,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD } } + fence_dirty_ = true; + auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + if (completionSignal.handle == 0) { // Get active signal for current dispatch if profiling is necessary barrier_value_packet_.completion_signal = @@ -1072,6 +1076,11 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD barrier_value_packet_.completion_signal = completionSignal; } + // Reset fence_dirty_ flag if we submit a barrier + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); while ((index - 
hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast<hsa_amd_barrier_value_packet_t*>( @@ -1079,9 +1088,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD *aql_loc = barrier_value_packet_; packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), packetHeader, rest); - auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); - hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, @@ -3252,11 +3258,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (timestamp_ != nullptr) { const Settings& settings = dev().settings(); int32_t releaseFlags = vcmd.getEventScope(); - if (releaseFlags == Device::CacheState::kCacheStateAgent) { + if (releaseFlags == Device::CacheState::kCacheStateIgnore) { if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) { - dispatchBarrierValuePacket(kBarrierVendorPacketAgentScopeHeader, true); + dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); } else { - dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false); + dispatchBarrierPacket(kNopPacketHeader, false); } } else { // Submit a barrier with a cache flushes. diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 1553dfe94f..8c002658f5 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -407,6 +407,7 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void resetFenceDirty() { fence_dirty_ = false; } // } roc OpenCL integration private: //!
Dispatches a barrier with blocking HSA signals diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index e02cb153bd..279bfc3d7a 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -436,7 +436,6 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList profilingInfo_.clear(); profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = true; - setEventScope(amd::Device::kCacheStateSystem); } kernel_.retain(); } diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 8b27b4ce30..ce8ba4ae37 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -120,7 +120,7 @@ void HostQueue::finish() { return; } } - if (nullptr == command || vdev()->isHandlerPending()) { + if (nullptr == command || vdev()->isHandlerPending() || vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); }