From 1410f002f3c8c194cc5f2b950e5ea96ffbc80b15 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Sat, 17 Oct 2020 10:04:58 -0700 Subject: [PATCH] Use barrier packets for event profiling Use barrier packets for every profile marker that gets submitted and use the completion signal to get the GPU timestamp. This gives the most accurate dispatch time. Combine cache flushes with the profile marker if there is a pending dispatch that needs a cache flush. This optimization saves an extra barrier and improves wall time. Change-Id: Ib62d6d7aabf4743827b561be6c9c5afa813203da [ROCm/clr commit: 59c6cb0268a34bdf9e73e78c5fdf8efde9bf68a2] --- .../clr/rocclr/device/rocm/rocvirtual.cpp | 88 ++++++++++++++++++- .../clr/rocclr/device/rocm/rocvirtual.hpp | 31 ++++--- projects/clr/rocclr/platform/command.hpp | 3 +- 3 files changed, 106 insertions(+), 16 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 1e026c587e..4ae1126449 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -73,6 +73,11 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t kNopPacketHeader = + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + static constexpr uint16_t kBarrierPacketAcquireHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | @@ -596,7 +601,6 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, return false; } - void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { assert(packet->completion_signal.handle != 0); const
uint32_t queueSize = gpu_queue_->size; @@ -627,6 +631,68 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal); } +void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet, + uint16_t packetHeader, hsa_signal_t signal) { + const uint32_t queueSize = gpu_queue_->size; + const uint32_t queueMask = queueSize - 1; + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); + uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + + if (signal.handle == 0) { + // Pool size must grow to the size of pending AQL packets + const uint32_t pool_size = index - read; + if (pool_size >= signal_pool_.size()) { + ProfilingSignal profiling_signal = {}; + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) { + LogPrintfError("Failed signal allocation id = %d", pool_size); + } + signal_pool_.push_back(profiling_signal); + assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!"); + } + // Move index inside the valid pool + ++current_signal_ %= signal_pool_.size(); + // Find signal slot + ProfilingSignal* profilingSignal = &signal_pool_[current_signal_]; + // Make sure we save the old results in the TS structure + if (profilingSignal->ts_ != nullptr) { + profilingSignal->ts_->checkGpuTime(); + } + if (timestamp_ != nullptr) { + // Update the new TS with the signal info + timestamp_->setProfilingSignal(profilingSignal); + profilingSignal->ts_ = timestamp_; + timestamp_->setAgent(gpu_device_); + } + packet->completion_signal = profilingSignal->signal_; + hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne); + } else { + assert(signal.handle != 0); + packet->completion_signal = signal; + } + + while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); + hsa_barrier_and_packet_t* aql_loc = + &(reinterpret_cast(gpu_queue_->base_address))[index & queueMask]; + 
*aql_loc = *packet; + __atomic_store_n(reinterpret_cast(aql_loc), packetHeader, __ATOMIC_RELEASE); + + hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); + ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, + "[%zx] HWq=0x%zx, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], completion_signal=0x%zx", + std::this_thread::get_id(), gpu_queue_, packetHeader, + extractAqlBits(packetHeader, HSA_PACKET_HEADER_TYPE, + HSA_PACKET_HEADER_WIDTH_TYPE), + extractAqlBits(packetHeader, HSA_PACKET_HEADER_BARRIER, + HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + packet->dep_signal[0], packet->dep_signal[1], packet->dep_signal[2], + packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal); +} + // ================================================================================================ void VirtualGPU::ResetQueueStates() { // Release all transfer buffers on this command queue @@ -934,7 +1000,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { return; } // Without barrier profiling will wait for each individual signal - timestamp_ = new Timestamp(!dev().settings().barrier_sync_); + timestamp_ = new Timestamp(); timestamp_->start(); } } @@ -2505,8 +2571,22 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { // std::cout<<__FUNCTION__<<" not implemented"<<"*********"< splittedSignals_; - bool wait_for_signal_; //!< Wait for signal before gathering the timestamp values public: uint64_t getStart() { @@ -96,12 +95,11 @@ class Timestamp { void setAgent(hsa_agent_t agent) { agent_ = agent; } - Timestamp(bool wait_for_signal = false) + Timestamp() : start_(0) , end_(0) , profilingSignal_(nullptr) - , splittedDispatch_(false) 
- , wait_for_signal_(wait_for_signal) { + , splittedDispatch_(false) { agent_.handle = 0; } @@ -116,7 +114,7 @@ class Timestamp { uint64_t start = UINT64_MAX; uint64_t end = 0; for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) { - if (wait_for_signal_) { + if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) { WaitForSignal(*it); } hsa_amd_profiling_get_dispatch_time(agent_, *it, &time); @@ -130,7 +128,8 @@ class Timestamp { start_ = start * ticksToTime_; end_ = end * ticksToTime_; } else { - if (wait_for_signal_) { + // If the signalValue is the same as initial set value, it means its not written to + if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) { WaitForSignal(profilingSignal_->signal_); } hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time); @@ -298,12 +297,22 @@ class VirtualGPU : public device::VirtualDevice { // } roc OpenCL integration private: - bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); - bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); - template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size = 1); + bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, + uint16_t rest, bool blocking = true); + bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, + uint16_t rest, bool blocking = true); + template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, + uint16_t rest, bool blocking, + size_t size = 1); void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet); - bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); - void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, amd::NDRangeContainer& sizes); + void 
dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet, uint16_t packetHeader, + hsa_signal_t signal); + void dispatchBarrierPacket(hsa_barrier_and_packet_t* packet, uint16_t packetHeader, + hsa_signal_t signal); + bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, + bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); + void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, + amd::NDRangeContainer& sizes); bool initPool(size_t kernarg_pool_size, uint signal_pool_count); void destroyPool(); diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 8b5d1d8bdb..fbbb9afe74 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -97,7 +97,7 @@ class Event : public RuntimeObject { static const EventWaitList nullWaitList; struct ProfilingInfo { - ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0) { + ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) { if (enabled) { clear(); callback_ = nullptr; @@ -111,6 +111,7 @@ class Event : public RuntimeObject { bool enabled_; //!< Profiling enabled for the wave limiter uint32_t waves_; //!< The number of waves used in a dispatch ProfilingCallback* callback_; + bool marker_ts_; void clear() { queued_ = 0ULL; submitted_ = 0ULL;