From f96e9733787d2b0fb41e1b356189dc0ddc666b17 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Fri, 22 Jan 2021 16:42:35 -0500 Subject: [PATCH] SWDEV-257787 - Add engine tracking per signal - The logic will trace compute, sdma read/write operations and apply signals when necessary - ROC_CPU_WAIT_FOR_SIGNAL, ROC_SYSTEM_SCOPE_SIGNAL and ROC_SKIP_COPY_SYNC were added to control the tracking Change-Id: I9e8e6174c63bf7784f7ab00964e2918c8667d364 [ROCm/clr commit: dbc7abaecf6a932570e40a134998246d88e7c2cd] --- projects/clr/rocclr/device/rocm/rocblit.cpp | 71 ++++-- projects/clr/rocclr/device/rocm/rocblit.hpp | 3 - projects/clr/rocclr/device/rocm/rocdefs.hpp | 6 +- .../clr/rocclr/device/rocm/rocsettings.cpp | 9 + .../clr/rocclr/device/rocm/rocsettings.hpp | 5 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 217 +++++++++++++++++- .../clr/rocclr/device/rocm/rocvirtual.hpp | 154 +++---------- projects/clr/rocclr/utils/flags.hpp | 6 + 8 files changed, 312 insertions(+), 159 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index e16b40ec37..d25e8e2f4f 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -433,18 +433,27 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d hsa_dim3_t offset = { 0, 0 ,0 }; - if ((srcRect.rowPitch_ % 4 != 0) || - (srcRect.slicePitch_ % 4 != 0) || - (dstRect.rowPitch_ % 4 != 0) || + if ((srcRect.rowPitch_ % 4 != 0) || + (srcRect.slicePitch_ % 4 != 0) || + (dstRect.rowPitch_ % 4 != 0) || (dstRect.slicePitch_ % 4 != 0)) { isSubwindowRectCopy = false; } + HwQueueEngine engine = HwQueueEngine::Unknown; + if ((srcAgent.handle == dev().getCpuAgent().handle) && + (dstAgent.handle != dev().getCpuAgent().handle)) { + engine = HwQueueEngine::SdmaWrite; + } else if ((srcAgent.handle != dev().getCpuAgent().handle) && + (dstAgent.handle == dev().getCpuAgent().handle)) { + engine = HwQueueEngine::SdmaRead; + 
} + + hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine); + uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1; + if (isSubwindowRectCopy ) { - hsa_signal_t wait = gpu().Barriers().WaitSignal(); hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); - uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1; - hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait; // Copy memory line by line hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, @@ -457,10 +466,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d } else { // Fall to line by line copies const hsa_signal_value_t kInitVal = size[2] * size[1]; - hsa_signal_t wait = gpu().Barriers().WaitSignal(); hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp()); - uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1; - hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait; for (size_t z = 0; z < size[2]; ++z) { for (size_t y = 0; y < size[1]; ++y) { @@ -472,18 +478,18 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d (reinterpret_cast
(dst) + dstOffset), dstAgent, (reinterpret_cast(src) + srcOffset), srcAgent, size[0], num_wait_events, wait_event, active); - gpu().setLastCommandSDMA(true) ; if (status != HSA_STATUS_SUCCESS) { gpu().Barriers().ResetCurrentSignal(); LogPrintfError("DMA buffer failed with code %d", status); return false; + } else { + gpu().setLastCommandSDMA(true); } } } } } - // Explicit wait for now, until runtime could distinguish compute and sdma operations - gpu().Barriers().WaitCurrent(); + return true; } @@ -644,18 +650,24 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, srcAgent = dstAgent = dev().getBackendDevice(); } - hsa_signal_t wait = gpu().Barriers().WaitSignal(); - hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); - uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1; - hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait; + HwQueueEngine engine = HwQueueEngine::Unknown; + if ((srcAgent.handle == dev().getCpuAgent().handle) && + (dstAgent.handle != dev().getCpuAgent().handle)) { + engine = HwQueueEngine::SdmaWrite; + } else if ((srcAgent.handle != dev().getCpuAgent().handle) && + (dstAgent.handle == dev().getCpuAgent().handle)) { + engine = HwQueueEngine::SdmaRead; + } + + hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine); + uint32_t num_wait_events = (wait_event == nullptr) ? 
0 : 1; + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Use SDMA to transfer the data status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], num_wait_events, wait_event, active); - gpu().setLastCommandSDMA(true); - // Explicit wait for now, until runtime could distinguish compute and sdma operations - gpu().Barriers().WaitCurrent(); if (status == HSA_STATUS_SUCCESS) { + gpu().setLastCommandSDMA(true); gpu().addSystemScope(); } else { gpu().Barriers().ResetCurrentSignal(); @@ -690,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ // Allocate requested size of memory while (totalSize > 0) { size = std::min(totalSize, dev().settings().stagedXferSize_); - hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Copy data from Host to Device if (hostToDev) { @@ -700,14 +711,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ const hsa_agent_t srcAgent = (size <= dev().settings().sdmaCopyThreshold_) ? 
dev().getBackendDevice() : dev().getCpuAgent(); + HwQueueEngine engine = HwQueueEngine::Unknown; + if (srcAgent.handle == dev().getBackendDevice().handle) { + engine = HwQueueEngine::SdmaWrite; + } + gpu().Barriers().SetActiveEngine(engine); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); + memcpy(hsaBuffer, hostSrc + offset, size); status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer, srcAgent, size, 0, nullptr, active); - gpu().setLastCommandSDMA(true); if (status != HSA_STATUS_SUCCESS) { gpu().Barriers().ResetCurrentSignal(); LogPrintfError("Hsa copy from host to device failed with code %d", status); return false; + } else { + gpu().setLastCommandSDMA(true); } gpu().Barriers().WaitCurrent(); totalSize -= size; @@ -721,14 +740,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ const hsa_agent_t dstAgent = (size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent(); + HwQueueEngine engine = HwQueueEngine::Unknown; + if (dstAgent.handle == dev().getBackendDevice().handle) { + engine = HwQueueEngine::SdmaRead; + } + gpu().Barriers().SetActiveEngine(engine); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); + // Copy data from Device to Host status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset, dev().getBackendDevice(), size, 0, nullptr, active); - gpu().setLastCommandSDMA(true); if (status == HSA_STATUS_SUCCESS) { + gpu().setLastCommandSDMA(true); gpu().Barriers().WaitCurrent(); memcpy(hostDst + offset, hsaBuffer, size); } else { + gpu().Barriers().ResetCurrentSignal(); LogPrintfError("Hsa copy from device to host failed with code %d", status); return false; } diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp index 5f344ff1af..a81480f21d 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.hpp +++ 
b/projects/clr/rocclr/device/rocm/rocblit.hpp @@ -40,9 +40,6 @@ class Kernel; class Memory; class VirtualGPU; -constexpr bool kSkipCpuWait = true; -constexpr bool kIgnoreBarrier = false; - //! DMA Blit Manager class DmaBlitManager : public device::HostBlitManager { public: diff --git a/projects/clr/rocclr/device/rocm/rocdefs.hpp b/projects/clr/rocclr/device/rocm/rocdefs.hpp index 28284d76e0..44827de45d 100644 --- a/projects/clr/rocclr/device/rocm/rocdefs.hpp +++ b/projects/clr/rocclr/device/rocm/rocdefs.hpp @@ -33,11 +33,15 @@ static constexpr uint DeviceQueueMaskSize = 32; //! Set to match the number of pipes, which is 8. static constexpr uint kMaxAsyncQueues = 8; +constexpr bool kSkipCpuWait = true; +constexpr bool kIgnoreBarrier = false; + enum HwQueueEngine : uint32_t { Compute = 0, SdmaRead = 1, SdmaWrite = 2, - Unknown = 3 + Unknown = 3, + External = 4 }; } // namespace roc diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index 4222061abb..e2f62210f5 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -28,6 +28,7 @@ namespace roc { +// ================================================================================================ Settings::Settings() { // Initialize the HSA device default settings @@ -91,8 +92,15 @@ Settings::Settings() { rocr_backend_ = true; barrier_sync_ = (!flagIsDefault(ROC_BARRIER_SYNC)) ? ROC_BARRIER_SYNC : true; + + cpu_wait_for_signal_ = !AMD_DIRECT_DISPATCH; + cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ? 
+ ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_; + system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL; + skip_copy_sync_ = ROC_SKIP_COPY_SYNC; } +// ================================================================================================ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, bool enableXNACK, bool coop_groups) { customHostAllocator_ = false; @@ -169,6 +177,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor return true; } +// ================================================================================================ void Settings::override() { // Limit reported workgroup size if (GPU_MAX_WORKGROUP_SIZE != 0) { diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp index e0e29c0d27..5abace8b85 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp @@ -52,7 +52,10 @@ class Settings : public device::Settings { uint stagedXferWrite_ : 1; //!< Uses a staged buffer write uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint barrier_sync_ : 1; //!< Use AQL barrier command to sync with CPU - uint reserved_ : 23; + uint cpu_wait_for_signal_ : 1; //!< Wait for HSA signal on CPU + uint system_scope_signal_ : 1; //!< HSA signal is visible to the entire system + uint skip_copy_sync_ : 1; //!< Ignore explicit HSA signal waits for copy functionality + uint reserved_ : 20; }; uint value_; }; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 7340d99e70..a4a6257bb2 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -35,10 +35,12 @@ #include "amd_hsa_kernel_code.h" #include -#include -#include #include +#include +#include #include +#include + /** * HSA image object size in bytes (see HSAIL spec) @@ -100,6 +102,37 @@ static unsigned
extractAqlBits(unsigned v, unsigned pos, unsigned width) { return (v >> pos) & ((1 << width) - 1); }; +// ================================================================================================ +void Timestamp::checkGpuTime() { + if (HwProfiling()) { + uint64_t start = std::numeric_limits::max(); + uint64_t end = 0; + + for (auto it : signals_) { + if (hsa_signal_load_relaxed(it->signal_) > 0) { + WaitForSignal(it->signal_); + } + hsa_amd_profiling_dispatch_time_t time = {}; + if (it->engine_ == HwQueueEngine::Compute) { + hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time); + } else { + hsa_amd_profiling_async_copy_time_t time_sdma = {}; + hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma); + time.start = time_sdma.start; + time.end = time_sdma.end; + } + start = std::min(time.start, start); + end = std::max(time.end, end); + it->ts_ = nullptr; + it->done_ = true; + } + signals_.clear(); + start_ = start * ticksToTime_; + end_ = end * ticksToTime_; + } +} + +// ================================================================================================ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { if (numMemObj > 0) { // Allocate the array of memory objects for dependency tracking @@ -114,6 +147,7 @@ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { return true; } +// ================================================================================================ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) { bool flushL1Cache = false; @@ -170,6 +204,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor numMemObjectsInQueue_++; } +// ================================================================================================ void VirtualGPU::MemoryDependency::clear(bool all) { if (numMemObjectsInQueue_ > 0) { size_t i, j; @@ -205,6 +240,143 @@ void VirtualGPU::MemoryDependency::clear(bool all) { } } +// 
================================================================================================ +VirtualGPU::HwQueueTracker::~HwQueueTracker() { + for (auto& signal: signal_list_) { + if (signal->signal_.handle != 0) { + hsa_signal_destroy(signal->signal_); + } + delete signal; + } +} + +// ================================================================================================ +bool VirtualGPU::HwQueueTracker::Create() { + constexpr size_t kSignalListSize = 16; + signal_list_.resize(kSignalListSize); + + hsa_agent_t agent = gpu_.gpu_device(); + const Settings& settings = gpu_.dev().settings(); + hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent; + uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1; + + for (uint i = 0; i < kSignalListSize; ++i) { + std::unique_ptr signal(new ProfilingSignal()); + if ((signal == nullptr) || + (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) { + return false; + } + signal_list_[i] = signal.release(); + } + return true; +} + +// ================================================================================================ +hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( + hsa_signal_value_t init_val, Timestamp* ts, uint32_t queue_size) { + // If queue size grows, then add more signals to avoid more frequent stalls + if (queue_size > signal_list_.size()) { + std::unique_ptr signal(new ProfilingSignal()); + if (signal != nullptr) { + hsa_agent_t agent = gpu_.gpu_device(); + const Settings& settings = gpu_.dev().settings(); + hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent; + uint32_t num_agents = (settings.system_scope_signal_) ? 
0 : 1; + + if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) { + signal_list_.push_back(signal.release()); + } + } + } + // Find valid index + ++current_id_ %= signal_list_.size(); + + // Make sure the previous operation on the current signal is done + WaitCurrent(); + + // Have to wait the next signal in the queue to avoid a race condition between + // a GPU waiter(which may be not triggered yet) and CPU signal reset below + WaitNext(); + + // Reset the signal and return + hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val); + signal_list_[current_id_]->done_ = false; + signal_list_[current_id_]->engine_ = engine_; + if (ts != 0) { + if (!sdma_profiling_) { + hsa_amd_profiling_async_copy_enable(true); + sdma_profiling_ = true; + } + signal_list_[current_id_]->ts_ = ts; + ts->AddProfilingSignal(signal_list_[current_id_]); + } + return signal_list_[current_id_]->signal_; +} + +// ================================================================================================ +hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) { + bool explicit_wait = false; + hsa_signal_t* signal = nullptr; + // Does runtime switch the active engine? + if (engine != engine_) { + // Yes, return the signal from the previous operation for a wait + engine_ = engine; + explicit_wait = true; + } else { + // Unknown engine in use, hence return a wait signal always + if (engine == HwQueueEngine::Unknown) { + explicit_wait = true; + } else { + // Check if skip wait optimization is enabled.
It will try to predict the same engine in ROCr + // and ignore signal wait, relying on in-order engine execution + const Settings& settings = gpu_.dev().settings(); + if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) { + explicit_wait = true; + } + } + } + // Check if a wait is required + if (explicit_wait) { + ProfilingSignal* prof_signal; + // Check if there is an external signal + if (external_signal_ != nullptr) { + prof_signal = external_signal_; + external_signal_ = nullptr; + } else { + prof_signal = signal_list_[current_id_]; + } + // Early signal status check + if (hsa_signal_load_relaxed(prof_signal->signal_) > 0) { + const Settings& settings = gpu_.dev().settings(); + // Wait on CPU if requested + if (settings.cpu_wait_for_signal_) { + CpuWaitForSignal(prof_signal); + } else { + return &prof_signal->signal_; + } + } + } + return signal; +} + +// ================================================================================================ +bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) { + // Wait for the current signal + if (!signal->done_) { + // Update timestamp values if requested + if (signal->ts_ != nullptr) { + signal->ts_->checkGpuTime(); + } else { + if (!WaitForSignal(signal->signal_)) { + LogPrintfError("Failed signal [0x%lx] wait", signal->signal_); + return false; + } + signal->done_ = true; + } + } + return true; +} + // ================================================================================================ void VirtualGPU::HwQueueTracker::ResetCurrentSignal() { // Reset the signal and return @@ -537,6 +709,16 @@ bool VirtualGPU::dispatchGenericAqlPacket( // ================================================================================================ bool VirtualGPU::dispatchAqlPacket( hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) { + hsa_signal_t* wait = Barriers().WaitingSignal(); + // AQL dispatch doesn't support dependent signals and
extra barrier packet must be generated + if (wait != nullptr) { + barrier_packet_.dep_signal[0] = *wait; + constexpr bool kSkipSignal = true; + dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal); + } else { + barrier_packet_.dep_signal[0] = hsa_signal_t{}; + } + return dispatchGenericAqlPacket(packet, header, rest, blocking); } @@ -587,6 +769,9 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet, if (!skipSignal) { // Pool size must grow to the size of pending AQL packets const uint32_t pool_size = index - read; + hsa_signal_t* wait = Barriers().WaitingSignal(); + packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{}; + // Get active signal for current dispatch if profiling is necessary packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size); @@ -663,6 +848,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative, schedulerParam_(nullptr), schedulerQueue_(nullptr), schedulerSignal_({0}), + barriers_(*this), cuMask_(cuMask), priority_(priority), copy_command_type_(0) @@ -804,7 +990,7 @@ bool VirtualGPU::create() { } // Allocate signal tracker for ROCr copy queue - if (!Barriers().Create(gpu_device())) { + if (!Barriers().Create()) { LogError("Could not create signal for copy queue!"); return false; } @@ -867,7 +1053,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { return; } // Without barrier profiling will wait for each individual signal - timestamp_ = new Timestamp(); + timestamp_ = new Timestamp(dev().getBackendDevice()); timestamp_->start(); } } @@ -1193,10 +1379,9 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { #if AMD_HMM_SUPPORT profilingBegin(cmd); // Initialize signal for the barrier - hsa_signal_t wait = Barriers().WaitSignal(); - hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); - uint32_t num_wait_events = (wait.handle == 0) ? 
0 : 1; - hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait; + hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown); + hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); + uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1; // Find the requested agent for the transfer hsa_agent_t agent = (cmd.cpu_access() || @@ -1207,7 +1392,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { hsa_status_t status = hsa_amd_svm_prefetch_async( const_cast(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active); - // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution. + // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) { Barriers().ResetCurrentSignal(); LogError("hsa_amd_svm_prefetch_async failed"); @@ -2376,6 +2561,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } return true; } + /** * @brief Api to dispatch a kernel for execution. The implementation * parses the input object, an instance of virtual command to obtain @@ -2385,10 +2571,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const * It also parses the kernel arguments buffer to inject into Hsa Runtime * the list of kernel parameters. 
*/ + // ================================================================================================ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) { // Wait for the execution on the current queue, since the coop groups will use the device queue - releaseGpuMemoryFence(); + releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait); // Get device queue for exclusive GPU access VirtualGPU* queue = dev().xferQueue(); @@ -2398,6 +2585,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { queue->profilingBegin(vcmd); + // Add a dependency into the device queue on the current queue + queue->Barriers().SetExternalSignal(Barriers().GetLastSignal()); + if (vcmd.cooperativeGroups()) { // Initialize GWS if it's cooperative groups launch uint32_t workgroups = 0; @@ -2420,7 +2610,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { vcmd.setStatus(CL_INVALID_OPERATION); } // Wait for the execution on the device queue. Keep the current queue in-order - queue->releaseGpuMemoryFence(); + queue->releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait); + + // Add a dependency into the current queue on the coop queue + Barriers().SetExternalSignal(queue->Barriers().GetLastSignal()); + hasPendingDispatch_ = true; queue->profilingEnd(vcmd); } else { @@ -2440,6 +2634,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { } } +// ================================================================================================ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { // std::cout<<__FUNCTION__<<" not implemented"<<"*********"< 0) ? true : false; } + const bool HwProfiling() const { return !signals_.empty(); } - void setAgent(hsa_agent_t agent) { agent_ = agent; } - - Timestamp() + Timestamp(hsa_agent_t agent) : start_(std::numeric_limits::max()) - , end_(0) { - agent_.handle = 0; - } + , end_(0) + , agent_(agent) {} ~Timestamp() {} //! 
Finds execution ticks on GPU - void checkGpuTime() { - if (HwProfiling()) { - hsa_amd_profiling_dispatch_time_t time = {}; - - uint64_t start = std::numeric_limits::max(); - uint64_t end = 0; - for (auto it : signals_) { - if (hsa_signal_load_relaxed(it->signal_) > 0) { - WaitForSignal(it->signal_); - } - hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time); - if ((time.end - time.start) == 0) { - hsa_amd_profiling_async_copy_time_t time_sdma = {}; - hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma); - time.start = time_sdma.start; - time.end = time_sdma.end; - } - start = std::min(time.start, start); - end = std::max(time.end, end); - it->ts_ = nullptr; - it->done_ = true; - } - signals_.clear(); - start_ = start * ticksToTime_; - end_ = end * ticksToTime_; - } - } + void checkGpuTime(); // Start a timestamp (get timestamp from OS) void start() { start_ = amd::Os::timeNanos(); } @@ -183,113 +154,54 @@ class VirtualGPU : public device::VirtualDevice { class HwQueueTracker : public amd::EmbeddedObject { public: - HwQueueTracker() {} + HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {} - ~HwQueueTracker() { - for (auto& signal: signal_list_) { - if (signal->signal_.handle != 0) { - hsa_signal_destroy(signal->signal_); - } - delete signal; - } - } + ~HwQueueTracker(); //! Creates a pool of signals for tracking of HW operations on the queue - bool Create(hsa_agent_t agent) { - constexpr size_t kSignalListSize = 16; - signal_list_.resize(kSignalListSize); - for (uint i = 0; i < kSignalListSize; ++i) { - ProfilingSignal* signal = new ProfilingSignal(); - if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create( - 0, 1, &agent, &signal->signal_))) { - return false; - } - signal_list_[i] = signal; - } - agent_ = agent; - return true; - } + bool Create(); //! 
Finds a free signal for the upcomming operation hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne, - Timestamp* ts = nullptr, uint32_t queue_size = 0) { - // If queue size grows, then add more signals to avoid more frequent stalls - if (queue_size > signal_list_.size()) { - ProfilingSignal* signal = new ProfilingSignal(); - if (signal != nullptr) { - if (HSA_STATUS_SUCCESS == hsa_signal_create( - 0, 1, &agent_, &signal->signal_)) { - signal_list_.push_back(signal); - } - } - } - // Find valid index - ++current_id_ %= signal_list_.size(); - - // Make sure the previous operation on the current signal is done - WaitCurrent(); - - // Have to wait the next signal in the queue to avoid a race condition between - // a GPU waiter(which may be not triggered yet) and CPU signal reset below - WaitNext(); - - // Reset the signal and return - hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val); - signal_list_[current_id_]->done_ = false; - if (ts != 0) { - if (!sdma_profiling_) { - hsa_amd_profiling_async_copy_enable(true); - sdma_profiling_ = true; - } - signal_list_[current_id_]->ts_ = ts; - ts->AddProfilingSignal(signal_list_[current_id_]); - ts->setAgent(agent_); - } - return signal_list_[current_id_]->signal_; - } + Timestamp* ts = nullptr, uint32_t queue_size = 0); //! Wait for the curent active signal. Can idle the queue - bool WaitCurrent() { return WaitIndex(current_id_); } + bool WaitCurrent() { return CpuWaitForSignal(signal_list_[current_id_]); } + + //! Update current active engine + void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; } //! Returns the last submitted signal for a wait - hsa_signal_t WaitSignal() { - //! @note Currently wait on CPU unconditionally to avoid a negative performance impact - WaitCurrent(); - return hsa_signal_t{}; - } + hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute); //! Resets current signal back to the previous one. 
It's necessary in a case of ROCr failure. void ResetCurrentSignal(); - private: + //! Inserts an external signal(submission in another queue) for dependency tracking + void SetExternalSignal(ProfilingSignal* signal) { + external_signal_ = signal; + engine_ = HwQueueEngine::External; + } + + //! Returns the last submitted signal of this queue, so another queue can track it as a dependency + ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; } + + private: //! Wait for the next active signal void WaitNext() { size_t next = (current_id_ + 1) % signal_list_.size(); - WaitIndex(next); + CpuWaitForSignal(signal_list_[next]); } //! Wait for the provided signal - bool WaitIndex(size_t index) { - // Wait for the current signal - if (!signal_list_[index]->done_) { - // Update timestamp values if requested - if (signal_list_[index]->ts_ != nullptr) { - signal_list_[index]->ts_->checkGpuTime(); - } else { - if (!WaitForSignal(signal_list_[index]->signal_)) { - LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_); - return false; - } - signal_list_[index]->done_ = true; - } - } - return true; - } + bool CpuWaitForSignal(ProfilingSignal* signal); - std::vector signal_list_; //!< The pool of all signals for processing - size_t current_id_ = 0; //!< Last submitted signal - hsa_agent_t agent_; //!< HSA device agent - bool sdma_profiling_ = false; //!< Don't enable SDMA profiling by default + HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations + std::vector signal_list_; //!< The pool of all signals for processing + ProfilingSignal* external_signal_ = nullptr; //!< Dependency on external signal + size_t current_id_ = 0; //!< Last submitted signal + bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled + const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker }; VirtualGPU(Device& device, bool profiling = false, bool cooperative = false, @@ -358,7 +270,7 @@ class VirtualGPU : 
public device::VirtualDevice { */ bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false); - hsa_agent_t gpu_device() { return gpu_device_; } + hsa_agent_t gpu_device() const { return gpu_device_; } hsa_queue_t* gpu_queue() { return gpu_queue_; } // Return pointer to PrintfDbg diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index ed9828778d..fe1b139f13 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -244,6 +244,12 @@ release(bool, ROC_ACTIVE_WAIT, false, \ "Forces unconditional active wait for GPU") \ release(bool, ROC_ENABLE_LARGE_BAR, true, \ "Enable Large Bar if supported by the device") \ +release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \ + "Enable CPU wait for dependent HSA signals.") \ +release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \ + "Enable system scope for signals (uses interrupts).") \ +release(bool, ROC_SKIP_COPY_SYNC, false, \ + "Skips copy syncs if runtime can predict the same engine.") \ release(bool, HIP_FORCE_QUEUE_PROFILING, false, \ "Force command queue profiling by default") \ release(uint, PAL_FORCE_ASIC_REVISION, 0, \