diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index 4db0dc6bc8..8e60af681b 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1527,6 +1527,13 @@ class Device : public RuntimeObject { kKernArg = 2 } MemorySegment; + typedef enum CacheState { + kCacheStateInvalid = -1, + kCacheStateIgnore = 0, + kCacheStateAgent = 1, + kCacheStateSystem = 2 + } CacheState; + typedef std::pair LinkAttrType; static constexpr size_t kP2PStagingSize = 4 * Mi; @@ -1749,6 +1756,11 @@ class Device : public RuntimeObject { return false; }; + // Returns bool value if the device cache is equal to the parameter + virtual bool IsCacheFlushed(CacheState state) const { + return false; + }; + virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual const uint32_t getPreferredNumaNode() const { return 0; } diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 8b7c33f7cc..4045d7cc99 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -180,6 +180,7 @@ Device::Device(hsa_agent_t bkendDevice) gpuvm_segment_.handle = 0; gpu_fine_grained_segment_.handle = 0; prefetch_signal_.handle = 0; + cache_state_ = Device::CacheState::kCacheStateInvalid; } void Device::setupCpuAgent() { @@ -2635,6 +2636,17 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* start, end); } } + +// ================================================================================================ +bool Device::IsCacheFlushed(Device::CacheState state) const { + return (static_cast(state) == cache_state_.load(std::memory_order_relaxed)); +} + +// ================================================================================================ +void Device::SetCacheState(Device::CacheState state) { + cache_state_.store(static_cast(state), std::memory_order_relaxed); +} + // ================================================================================================ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 5004941a90..b908cad8d0 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -260,6 +260,8 @@ class NullDevice : public amd::Device { virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; + virtual bool IsCacheFlushed(Device::CacheState state) const { return false; }; + virtual void SetCacheState(Device::CacheState state) {}; virtual void ReleaseGlobalSignal(void* signal) const {} #if defined(__clang__) @@ -440,6 +442,8 @@ class Device : public NullDevice { virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; + virtual bool IsCacheFlushed(Device::CacheState state) const; + virtual void SetCacheState(Device::CacheState state); virtual void ReleaseGlobalSignal(void* signal) const; //! Allocate host memory in terms of numa policy set by user @@ -583,6 +587,7 @@ class Device : public NullDevice { hsa_amd_memory_pool_t gpuvm_segment_; hsa_amd_memory_pool_t gpu_fine_grained_segment_; hsa_signal_t prefetch_signal_; //!< Prefetch signal, used to explicitly prefetch SVM on device + std::atomic cache_state_; //!< State of cache, kUnknown/kFlushedToDevice/kFlushedToSystem size_t gpuvm_segment_max_alloc_; size_t alloc_granularity_; diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index b8b5d14341..7619abd224 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -77,6 +77,11 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t kBarrierPacketAgentScopeHeader = + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + static constexpr uint16_t kNopPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | @@ -126,7 +131,7 @@ void Timestamp::checkGpuTime() { } // Avoid profiling data for the sync barrier, in tiny performance tests the first call // to ROCr is very slow and that also affects the overall performance of the callback thread - if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_) { + if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ > 0) { hsa_amd_profiling_dispatch_time_t time = {}; if (it->engine_ == HwQueueEngine::Compute) { hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time); @@ -442,7 +447,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( // Update the current command/marker with HW event prof_signal->retain(); ts->command().SetHwEvent(prof_signal); - } else if (ts->command().profilingInfo().marker_ts_ ) { + } else if (ts->command().profilingInfo().marker_ts_ > 0 ) { // Update the current command/marker with HW event prof_signal->retain(); ts->command().SetHwEvent(prof_signal); @@ -803,11 +808,15 @@ bool VirtualGPU::dispatchGenericAqlPacket( uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + auto cache_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + if (timestamp_ != nullptr) { // Pool size must grow to the size of pending AQL packets const uint32_t pool_size = index - read; // Get active signal for current dispatch if profiling is necessary - packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); + packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, + pool_size); } // Make sure the slot is free for usage @@ -845,8 +854,7 @@ bool VirtualGPU::dispatchGenericAqlPacket( HSA_PACKET_HEADER_WIDTH_BARRIER), extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + cache_state, rest, reinterpret_cast(packet)->grid_size_x, reinterpret_cast(packet)->grid_size_y, reinterpret_cast(packet)->grid_size_z, @@ -863,6 +871,8 @@ bool VirtualGPU::dispatchGenericAqlPacket( //hsa_queue_store_write_index_release(gpu_queue_, index); hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1); + roc_device_.SetCacheState(static_cast(cache_state)); + // Wait on signal ? if (blocking) { LogInfo("Runtime reachead the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!"); @@ -957,6 +967,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); if (!skipSignal) { // Pool size must grow to the size of pending AQL packets const uint32_t pool_size = index - read; @@ -987,11 +999,13 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, HSA_PACKET_HEADER_WIDTH_BARRIER), extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + cache_state, barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1], barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3], barrier_packet_.dep_signal[4], barrier_packet_.completion_signal); + + roc_device_.SetCacheState(static_cast(cache_state)); + // Clear dependent signals for the next packet barrier_packet_.dep_signal[0] = hsa_signal_t{}; barrier_packet_.dep_signal[1] = hsa_signal_t{}; @@ -2303,7 +2317,12 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t unsigned int* headerPtr = reinterpret_cast(&header); __atomic_store_n(reinterpret_cast(aql_loc), *headerPtr, __ATOMIC_RELEASE); + auto cache_state = extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); + roc_device_.SetCacheState(static_cast(cache_state)); + ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, "HWq=0x%zx, BarrierValue Header = 0x%x AmdFormat = 0x%x ", "(type=%d, barrier=%d, acquire=%d, release=%d), " @@ -2313,8 +2332,7 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t extractAqlBits(header.header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER), extractAqlBits(header.header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + cache_state, packet->completion_signal, packet->value, packet->mask, packet->cond, HSA_SIGNAL_CONDITION_GTE, HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_NE); } @@ -3096,10 +3114,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { } else { profilingBegin(vcmd); if (timestamp_ != nullptr) { - // Submit a barrier with a cache flushes. - dispatchBarrierPacket(kBarrierPacketHeader, false); - - hasPendingDispatch_ = false; + uint32_t releaseFlags = vcmd.profilingInfo().marker_ts_; + if (ROC_EVENT_NO_FLUSH && releaseFlags == Device::CacheState::kCacheStateIgnore) { + dispatchBarrierPacket(kNopPacketHeader, false); + } else if (releaseFlags == Device::CacheState::kCacheStateAgent) { + dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false); + } else { + // Submit a barrier with a cache flushes. + dispatchBarrierPacket(kBarrierPacketHeader, false); + hasPendingDispatch_ = false; + } } profilingEnd(vcmd); } diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index eec70f483d..cdc09f4215 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -55,8 +55,14 @@ Event::Event(HostQueue& queue) } // ================================================================================================ -Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED), - hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); } +Event::Event() + : callbacks_(NULL), + status_(CL_SUBMITTED), + hw_event_(nullptr), + notify_event_(nullptr), + device_(nullptr) { + notified_.clear(); +} // ================================================================================================ Event::~Event() { @@ -241,7 +247,7 @@ bool Event::awaitCompletion() { return false; } - ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", + ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d", this, status()); auto* queue = command().queue(); if ((queue != nullptr) && queue->vdev()->ActiveWait()) { @@ -256,7 +262,7 @@ bool Event::awaitCompletion() { lock_.wait(); } } - ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this); + ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this); } return status() == CL_COMPLETE; @@ -277,7 +283,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) { notified_.clear(); return false; } - ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue); + ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue); command->enqueue(); // Save notification, associated with the current event notify_event_ = command; @@ -290,7 +296,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) { notified_.clear(); return false; } - ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue); + ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue); command->enqueue(); command->release(); } @@ -336,7 +342,7 @@ void Command::enqueue() { Agent::postEventCreate(as_cl(static_cast(this)), type_); } - ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this); + ClPrint(LOG_DEBUG, LOG_CMD, "Command enqueued: %p", this); // Direct dispatch logic below will submit the command immediately, but the command status // update will occur later after flush() with a wait @@ -360,7 +366,7 @@ void Command::enqueue() { EnableProfiling(); } - if (isMarker && !profilingInfo().marker_ts_) { + if (isMarker && (profilingInfo().marker_ts_ == 0)) { // Update batch head for the current marker. Hence the status of all commands can be // updated upon the marker completion SetBatchHead(queue_->GetSubmittionBatch()); @@ -414,6 +420,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList profilingInfo_.enabled_ = true; profilingInfo_.clear(); profilingInfo_.callback_ = nullptr; + profilingInfo_.marker_ts_ = 1; } kernel_.retain(); } diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index 395d4642e0..4ea2636ddc 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -102,7 +102,7 @@ class Event : public RuntimeObject { static const EventWaitList nullWaitList; struct ProfilingInfo { - ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) { + ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(0) { if (enabled) { clear(); callback_ = nullptr; @@ -113,10 +113,11 @@ class Event : public RuntimeObject { uint64_t submitted_; uint64_t start_; uint64_t end_; - bool enabled_; //!< Profiling enabled for the wave limiter - uint32_t waves_; //!< The number of waves used in a dispatch + bool enabled_; //!< Profiling enabled for the wave limiter + uint32_t waves_; //!< The number of waves used in a dispatch ProfilingCallback* callback_; - bool marker_ts_; + uint32_t marker_ts_; //!< Marker with release scope + //!< 5 - system scope, 3 - device scope, 1 - no scopes void clear() { queued_ = 0ULL; submitted_ = 0ULL; diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index e8b914971d..daae776bd5 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -106,20 +106,20 @@ bool HostQueue::terminate() { void HostQueue::finish() { Command* command = nullptr; + bool isCacheFlushed = device().IsCacheFlushed(Device::CacheState::kCacheStateSystem); if (IS_HIP) { command = getLastQueuedCommand(true); - // Check if the queue has nothing to process and return - if (AMD_DIRECT_DISPATCH && command == nullptr) { + if (AMD_DIRECT_DISPATCH && isCacheFlushed && command == nullptr) { return; } } - if (nullptr == command) { + if (nullptr == command || !isCacheFlushed) { // Send a finish to make sure we finished all commands command = new Marker(*this, false); if (command == NULL) { return; } - ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued"); + ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued, Cache Flushed = %d", isCacheFlushed); command->enqueue(); } // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status @@ -194,7 +194,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) { continue; } - ClPrint(LOG_DEBUG, LOG_CMD, "command (%s) is submitted: %p", getOclCommandKindString(command->type()), command); + ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p", getOclCommandKindString(command->type()), command); command->setStatus(CL_SUBMITTED); diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index 147349cc6b..0912657d2e 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -273,7 +273,9 @@ release(uint, ROC_AQL_QUEUE_SIZE, 4096, \ release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \ "If true, then runtime can skip kernel arg copy") \ release(bool, GPU_STREAMOPS_CP_WAIT, false, \ - "Force the stream wait memory operation to wait on CP.") + "Force the stream wait memory operation to wait on CP.") \ +release(bool, ROC_EVENT_NO_FLUSH, false, \ + "Use NOP AQL packet for event records with no explicit flags.") namespace amd {