From 9b045922a8a1702919cf20479b0703c7e1728f5a Mon Sep 17 00:00:00 2001 From: "Kudchadker, Saleel" Date: Wed, 6 Aug 2025 20:34:42 -0700 Subject: [PATCH] SWDEV-538195 - Introduce threshold for handler submission (#723) - When doing device/stream sync, we can submit a handler which may introduce some host side delays. Use DEBUG_CLR_BATCH_CPU_SYNC_SIZE to batch commands for host wait. Default for HIP is 8 commands. - Investigation is underway in ROCr but need to address this for now in HIP runtime. --- hipamd/src/hip_device.cpp | 3 +++ hipamd/src/hip_device_runtime.cpp | 2 +- rocclr/device/rocm/rocblit.cpp | 4 ++++ rocclr/device/rocm/rocvirtual.cpp | 9 ++++++++- rocclr/device/rocm/rocvirtual.hpp | 1 + rocclr/platform/commandqueue.cpp | 20 +++++++++++++++++--- rocclr/platform/commandqueue.hpp | 3 +++ rocclr/utils/flags.hpp | 4 +++- 8 files changed, 40 insertions(+), 6 deletions(-) diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 1fff0b2175..fef71fb7b7 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -32,6 +32,7 @@ namespace hip { // ================================================================================================ hip::Stream* Device::NullStream(bool wait) { + ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait); if (null_stream_ == nullptr) { amd::ScopedLock lock(lock_); if (null_stream_ == nullptr) { @@ -188,6 +189,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre if (wait_null_stream) { if (null_stream_) { + ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_); waitForStream(null_stream_); } } else { @@ -198,6 +200,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre ((active_stream->Flags() & hipStreamNonBlocking) == 0) && // and it's not the current stream (active_stream != blocking_stream)) { + ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream); // Get the last valid command waitForStream(active_stream); } diff --git a/hipamd/src/hip_device_runtime.cpp b/hipamd/src/hip_device_runtime.cpp index 6626e1672a..180ec9251e 100644 --- a/hipamd/src/hip_device_runtime.cpp +++ b/hipamd/src/hip_device_runtime.cpp @@ -674,7 +674,7 @@ hipError_t hipDeviceSynchronize() { CHECK_SUPPORTED_DURING_CAPTURE(); constexpr bool kDoWaitForCpu = false; hip::getCurrentDevice()->SyncAllStreams(kDoWaitForCpu); - HIP_RETURN(hipSuccess); + HIP_RETURN_DURATION(hipSuccess); } int ihipGetDevice() { diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 81a8eab29b..40ff3454ae 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -358,6 +358,8 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d } } + // The hsa copy api would result in a dirty cache state + gpu().setFenceDirty(false); return true; } @@ -573,6 +575,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, if (status == HSA_STATUS_SUCCESS) { gpu().addSystemScope(); + // The hsa copy api would result in a dirty cache state + gpu().setFenceDirty(false); } else { gpu().Barriers().ResetCurrentSignal(); LogPrintfError("HSA copy failed with code %d, falling to Blit copy", status); diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index b8d4d63e2b..d5247ad82c 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -136,7 +136,7 @@ void Timestamp::checkGpuTime() { amd::ScopedLock lock(it->LockSignalOps()); // Ignore the wait if runtime processes API callback, because the signal value is bigger // than expected and the value reset will occur after API callback is done - if (GetCallbackSignal().handle == 0) { + if (GetCallbackSignal().handle == 0 || GetBlocking() == false) { WaitForSignal(it->signal_); } // Avoid profiling data for the sync barrier, in tiny performance tests the first call @@ -1006,6 +1006,7 @@ bool VirtualGPU::dispatchGenericAqlPacket( // Check for queue full and wait if needed. uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + fence_dirty_ = true; if (addSystemScope_) { header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE | @@ -1018,6 +1019,12 @@ bool VirtualGPU::dispatchGenericAqlPacket( auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + // Reset fence_dirty_ flag if we submit a packet with system scopes + if (expected_fence_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } + + // Dirty optimization to save on consequent dispatch packets which have requested flushes if (fence_state_ == amd::Device::kCacheStateSystem && expected_fence_state == amd::Device::kCacheStateSystem) { header = dispatchPacketHeader_; diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 8ad16d3304..1d9c8b756b 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -449,6 +449,7 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void setFenceDirty(bool state) { fence_dirty_ = state; } void HiddenHeapInit(); void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; } diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 1f47483bdd..665e5313f8 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -159,7 +159,11 @@ void HostQueue::finishCommand(Command* command) { void HostQueue::finish(bool cpu_wait) { Command* command = nullptr; + size_t minBatchSize = 0; + if (IS_HIP) { + minBatchSize = DEBUG_CLR_BATCH_CPU_SYNC_SIZE; + command = getLastQueuedCommand(true); if (command == nullptr) { return; @@ -170,23 +174,33 @@ void HostQueue::finish(bool cpu_wait) { cpu_wait = true; } } + + size_t batchSize = GetSubmissionBatchSize(); + ClPrint(LOG_DEBUG, LOG_CMD, + "finish() called with batch size: %zu, cpu_wait: %d, " + "fence dirty: %d", + batchSize, cpu_wait, vdev()->isFenceDirty()); + // Force marker if the batch wasn't sent for CPU update or fence is dirty if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); } + const Command::EventWaitList nullWaitList = {}; // Send a finish to make sure we finished all commands - command = new Marker(*this, false); + command = new Marker(*this, false, nullWaitList, nullptr, batchSize < minBatchSize); if (command == NULL) { return; } - ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to %p for finish", this); command->enqueue(); } // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status static constexpr bool kWaitCompletion = true; if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion)) { - ClPrint(LOG_DEBUG, LOG_CMD, "No HW event || cpu wait=%d, await command completion", cpu_wait); + ClPrint(LOG_DEBUG, LOG_CMD, + "No HW event or batch size is less than %zu, " + "await command completion", + minBatchSize); command->awaitCompletion(); if (IS_HIP) { diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp index 94c63fca53..f46c59a7f3 100644 --- a/rocclr/platform/commandqueue.hpp +++ b/rocclr/platform/commandqueue.hpp @@ -251,6 +251,9 @@ class HostQueue : public CommandQueue { //! Get the submitted batch Command* GetSubmissionBatch() const { return head_; } + //! Get the current batch size + size_t GetSubmissionBatchSize() const { return size_; } + //! Insert a command into the linked list of submitted commands void FormSubmissionBatch(Command* command) { // Insert the command to the linked list. diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index ab7a7b3dfc..722cb0174d 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -270,7 +270,9 @@ release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false, \ release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true, \ "Set this to true, to avoid host side abort for GPU errors") \ release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \ - "Force use of SPIRV instead of device specific code object.") \ + "Force use of SPIRV instead of device specific code object.") \ +release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8, \ + "Forces the minimum batch size for CPU sync") \ namespace amd {