From 33c1e3d14de73ef7eb3ba2cc89928c0fb16124fd Mon Sep 17 00:00:00 2001
From: German Andryeyev
Date: Wed, 17 Feb 2021 14:48:08 -0500
Subject: [PATCH] SWDEV-272496 - Wait on CPU before switching to GPU wait

GPU waits have noticeable overheads on compute with extra AQL
barrier packet and on SDMA with power saving features.
This change introduces a wait on CPU for 30 us in case the app has
tiny operations.

Change-Id: I761ba3af595f3f48544980058a9077dda15aa5f9

[ROCm/clr commit: ac387f9b03053f5350bca0386a658c3914870fc2]
---
Reviewer notes (this section sits after the "---" separator and is not
part of the commit message):

- The template argument lists had been stripped from this patch text and
  are restored here: std::numeric_limits<uint64_t>,
  template <uint64_t wait_time = 0>, and WaitForSignal<kTimeout30us>.
  The spellings are inferred from usage (wait_time is compared with 0 and
  passed as the timeout of hsa_signal_wait_scacquire; the call site's
  comment promises a 30 us wait) - confirm against the upstream commit.
- The patch had been collapsed onto two lines; line breaks and leading
  whitespace are reconstructed so that the hunk line counts match the
  hunk headers. Verify indentation against the target tree before
  applying (or apply with whitespace differences ignored).
- The From: header carries no e-mail address; it was not reconstructed.
- NOTE(review): kTimeout30us = 30000 is a 30 us timeout only if the HSA
  timeout hint is counted in nanoseconds - TODO confirm the unit used by
  the HSA runtime on the target platform.

 .../clr/rocclr/device/rocm/rocvirtual.cpp | 14 +++++----
 .../clr/rocclr/device/rocm/rocvirtual.hpp | 30 ++++++++++++-------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 414dd87191..c2a988d1fa 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -388,11 +388,15 @@ hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
       // Early signal status check
       if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) {
         const Settings& settings = gpu_.dev().settings();
-        // Wait on CPU if requested
-        if (settings.cpu_wait_for_signal_) {
-          CpuWaitForSignal(prof_signal);
-        } else {
-          return &prof_signal->signal_;
+        // Actively wait on CPU for 30 us to avoid extra overheads of signal tracking on GPU
+        if (!WaitForSignal<kTimeout30us>(prof_signal->signal_)) {
+          if (settings.cpu_wait_for_signal_) {
+            // Wait on CPU for completion if requested
+            CpuWaitForSignal(prof_signal);
+          } else {
+            // Return HSA signal for tracking on GPU
+            return &prof_signal->signal_;
+          }
         }
       }
     }
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 2ebf5ede6d..74f3b5f0f3 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -50,21 +50,31 @@ struct ProfilingSignal : public amd::HeapObject {
 };
 
 // Initial HSA signal value
-constexpr hsa_signal_value_t kInitSignalValueOne = 1;
+constexpr static hsa_signal_value_t kInitSignalValueOne = 1;
 
+// Timeouts for HSA signal wait
+constexpr static uint64_t kTimeout30us = 30000;
+constexpr static uint64_t kUnlimitedWait = std::numeric_limits<uint64_t>::max();
+
+template <uint64_t wait_time = 0>
 inline bool WaitForSignal(hsa_signal_t signal) {
-  constexpr uint64_t Timeout30us = 30000;
-  constexpr uint64_t UnlimitedWait = std::numeric_limits<uint64_t>::max();
-  uint64_t timeout = (ROC_ACTIVE_WAIT) ? UnlimitedWait : Timeout30us;
-
-  // Active wait with a timeout
-  if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
-                                timeout, HSA_WAIT_STATE_ACTIVE) != 0) {
-    // Wait until the completion with CPU suspend
+  if (wait_time != 0) {
     if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
-                                  UnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) {
+                                  wait_time, HSA_WAIT_STATE_ACTIVE) != 0) {
       return false;
     }
+  } else {
+    uint64_t timeout = (ROC_ACTIVE_WAIT) ? kUnlimitedWait : kTimeout30us;
+
+    // Active wait with a timeout
+    if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
+                                  timeout, HSA_WAIT_STATE_ACTIVE) != 0) {
+      // Wait until the completion with CPU suspend
+      if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
+                                    kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) {
+        return false;
+      }
+    }
   }
   return true;
 }