diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index cd19626b03..d846bfac45 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1791,11 +1791,6 @@ class Device : public RuntimeObject { return false; }; - // Returns bool value if the device cache is equal to the parameter - virtual bool IsCacheFlushed(CacheState state) const { - return false; - }; - virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual const uint32_t getPreferredNumaNode() const { return 0; } diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 7d7870d5f3..2417bc05e7 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -2656,18 +2656,6 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* } } -// ================================================================================================ -bool Device::IsCacheFlushed(Device::CacheState state) const { - - return ROC_EVENT_NO_FLUSH ? - (static_cast(state) == cache_state_.load(std::memory_order_relaxed)) : true; -} - -// ================================================================================================ -void Device::SetCacheState(Device::CacheState state) { - cache_state_.store(static_cast(state), std::memory_order_relaxed); -} - // ================================================================================================ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 44339bb9d0..f2811134e3 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -259,8 +259,6 @@ class NullDevice : public amd::Device { virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; - virtual bool IsCacheFlushed(Device::CacheState state) const { return false; }; - virtual void SetCacheState(Device::CacheState state) {}; virtual void ReleaseGlobalSignal(void* signal) const {} #if defined(__clang__) @@ -446,8 +444,6 @@ class Device : public NullDevice { virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; - virtual bool IsCacheFlushed(Device::CacheState state) const; - virtual void SetCacheState(Device::CacheState state); virtual void ReleaseGlobalSignal(void* signal) const; //! Allocate host memory in terms of numa policy set by user diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 16656a98dd..25ad112466 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -819,9 +819,16 @@ bool VirtualGPU::dispatchGenericAqlPacket( uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); - auto cache_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + if (fence_state_ == amd::Device::kCacheStateSystem && + expected_fence_state == amd::Device::kCacheStateSystem) { + header = dispatchPacketHeader_; + } + + fence_state_ = static_cast(expected_fence_state); + if (timestamp_ != nullptr) { // Get active signal for current dispatch if profiling is necessary packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); @@ -862,7 +869,8 @@ bool VirtualGPU::dispatchGenericAqlPacket( HSA_PACKET_HEADER_WIDTH_BARRIER), extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - cache_state, + extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), rest, reinterpret_cast(packet)->grid_size_x, reinterpret_cast(packet)->grid_size_y, reinterpret_cast(packet)->grid_size_z, @@ -876,11 +884,8 @@ bool VirtualGPU::dispatchGenericAqlPacket( reinterpret_cast(packet)->completion_signal); } - //hsa_queue_store_write_index_release(gpu_queue_, index); hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1); - roc_device_.SetCacheState(static_cast(cache_state)); - // Wait on signal ? if (blocking) { LogInfo("Runtime reachead the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!"); @@ -1004,13 +1009,12 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, HSA_PACKET_HEADER_WIDTH_BARRIER), extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - cache_state, + extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1], barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3], barrier_packet_.dep_signal[4], barrier_packet_.completion_signal); - roc_device_.SetCacheState(static_cast(cache_state)); - // Clear dependent signals for the next packet barrier_packet_.dep_signal[0] = hsa_signal_t{}; barrier_packet_.dep_signal[1] = hsa_signal_t{}; @@ -1069,7 +1073,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, kernarg_pool_signal_(KernelArgPoolNumSignal), cuMask_(cuMask), priority_(priority), - copy_command_type_(0) + copy_command_type_(0), + fence_state_(Device::CacheState::kCacheStateInvalid) { index_ = device.numOfVgpus_++; gpu_device_ = device.getBackendDevice(); @@ -1088,23 +1093,23 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, if (device.settings().fenceScopeAgent_) { dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); dispatchPacketHeader_= (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); } else { dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); dispatchPacketHeader_= (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); } aqlHeader_ = dispatchPacketHeader_; @@ -2334,7 +2339,6 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); - roc_device_.SetCacheState(static_cast(cache_state)); ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, "HWq=0x%zx, BarrierValue Header = 0x%x AmdFormat = 0x%x ", @@ -3004,10 +3008,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if (addSystemScope_ || (vcmd != nullptr && vcmd->getEventScope() == amd::Device::kCacheStateSystem)) { - aqlHeaderWithOrder &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE | - HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - aqlHeaderWithOrder |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE | - HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aqlHeaderWithOrder &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE | + HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + aqlHeaderWithOrder |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE | + HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); addSystemScope_ = false; } @@ -3151,9 +3155,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { profilingBegin(vcmd); if (timestamp_ != nullptr) { int32_t releaseFlags = vcmd.getEventScope(); - if (ROC_EVENT_NO_FLUSH && releaseFlags == Device::CacheState::kCacheStateIgnore) { - dispatchBarrierPacket(kNopPacketHeader, false); - } else if (releaseFlags == Device::CacheState::kCacheStateAgent) { + if (releaseFlags == Device::CacheState::kCacheStateAgent) { dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false); } else { // Submit a barrier with a cache flushes. diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 01b1ee3e27..21c5c12951 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -533,5 +533,7 @@ class VirtualGPU : public device::VirtualDevice { cl_command_type copy_command_type_; //!< Type of the copy command, used for ROC profiler //!< OCL doesn't distinguish diffrent copy types, //!< but ROC profiler expects D2H or H2D detection + int fence_state_; //!< Fence scope + //!< kUnknown/kFlushedToDevice/kFlushedToSystem }; } diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 070b239cf3..9ecd870e19 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -111,14 +111,13 @@ bool HostQueue::terminate() { void HostQueue::finish() { Command* command = nullptr; - bool isCacheFlushed = device().IsCacheFlushed(Device::CacheState::kCacheStateSystem); if (IS_HIP) { command = getLastQueuedCommand(true); - if (AMD_DIRECT_DISPATCH && isCacheFlushed && command == nullptr) { + if (AMD_DIRECT_DISPATCH && command == nullptr) { return; } } - if (nullptr == command || !isCacheFlushed || vdev()->isHandlerPending()) { + if (nullptr == command || vdev()->isHandlerPending()) { if (nullptr != command) { command->release(); } @@ -127,7 +126,7 @@ void HostQueue::finish() { if (command == NULL) { return; } - ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued, Cache Flushed = %d", isCacheFlushed); + ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish"); command->enqueue(); } // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index c9d25e0887..f9505d31a7 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -275,9 +275,7 @@ release(uint, ROC_SIGNAL_POOL_SIZE, 32, \ release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \ "If true, then runtime can skip kernel arg copy") \ release(bool, GPU_STREAMOPS_CP_WAIT, false, \ - "Force the stream wait memory operation to wait on CP.") \ -release(bool, ROC_EVENT_NO_FLUSH, false, \ - "Use NOP AQL packet for event records with no explicit flags.") + "Force the stream wait memory operation to wait on CP.") namespace amd {