diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 1e026c587e..4ae1126449 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -73,6 +73,11 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t kNopPacketHeader = + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + static constexpr uint16_t kBarrierPacketAcquireHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | @@ -596,7 +601,6 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, return false; } - void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { assert(packet->completion_signal.handle != 0); const uint32_t queueSize = gpu_queue_->size; @@ -627,6 +631,68 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal); } +void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet, + uint16_t packetHeader, hsa_signal_t signal) { + const uint32_t queueSize = gpu_queue_->size; + const uint32_t queueMask = queueSize - 1; + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); + uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + + if (signal.handle == 0) { + // Pool size must grow to the size of pending AQL packets + const uint32_t pool_size = index - read; + if (pool_size >= signal_pool_.size()) { + ProfilingSignal profiling_signal = {}; + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) { + LogPrintfError("Failed signal allocation id = %d", pool_size); + } + signal_pool_.push_back(profiling_signal); + assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!"); + } + // Move index inside the valid pool + ++current_signal_ %= signal_pool_.size(); + // Find signal slot + ProfilingSignal* profilingSignal = &signal_pool_[current_signal_]; + // Make sure we save the old results in the TS structure + if (profilingSignal->ts_ != nullptr) { + profilingSignal->ts_->checkGpuTime(); + } + if (timestamp_ != nullptr) { + // Update the new TS with the signal info + timestamp_->setProfilingSignal(profilingSignal); + profilingSignal->ts_ = timestamp_; + timestamp_->setAgent(gpu_device_); + } + packet->completion_signal = profilingSignal->signal_; + hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne); + } else { + assert(signal.handle != 0); + packet->completion_signal = signal; + } + + while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); + hsa_barrier_and_packet_t* aql_loc = + &(reinterpret_cast(gpu_queue_->base_address))[index & queueMask]; + *aql_loc = *packet; + __atomic_store_n(reinterpret_cast(aql_loc), packetHeader, __ATOMIC_RELEASE); + + hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); + ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, + "[%zx] HWq=0x%zx, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], completion_signal=0x%zx", + std::this_thread::get_id(), gpu_queue_, packetHeader, + extractAqlBits(packetHeader, HSA_PACKET_HEADER_TYPE, + HSA_PACKET_HEADER_WIDTH_TYPE), + extractAqlBits(packetHeader, HSA_PACKET_HEADER_BARRIER, + HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + packet->dep_signal[0], packet->dep_signal[1], packet->dep_signal[2], + packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal); +} + // ================================================================================================ void VirtualGPU::ResetQueueStates() { // Release all transfer buffers on this command queue @@ -934,7 +1000,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { return; } // Without barrier profiling will wait for each individual signal - timestamp_ = new Timestamp(!dev().settings().barrier_sync_); + timestamp_ = new Timestamp(); timestamp_->start(); } } @@ -2505,8 +2571,22 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { // std::cout<<__FUNCTION__<<" not implemented"<<"*********"< splittedSignals_; - bool wait_for_signal_; //!< Wait for signal before gathering the timestamp values public: uint64_t getStart() { @@ -96,12 +95,11 @@ class Timestamp { void setAgent(hsa_agent_t agent) { agent_ = agent; } - Timestamp(bool wait_for_signal = false) + Timestamp() : start_(0) , end_(0) , profilingSignal_(nullptr) - , splittedDispatch_(false) - , wait_for_signal_(wait_for_signal) { + , splittedDispatch_(false) { agent_.handle = 0; } @@ -116,7 +114,7 @@ class Timestamp { uint64_t start = UINT64_MAX; uint64_t end = 0; for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) { - if (wait_for_signal_) { + if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) { WaitForSignal(*it); } hsa_amd_profiling_get_dispatch_time(agent_, *it, &time); @@ -130,7 +128,8 @@ class Timestamp { start_ = start * ticksToTime_; end_ = end * ticksToTime_; } else { - if (wait_for_signal_) { + // If the signalValue is the same as initial set value, it means its not written to + if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) { WaitForSignal(profilingSignal_->signal_); } hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time); @@ -298,12 +297,22 @@ class VirtualGPU : public device::VirtualDevice { // } roc OpenCL integration private: - bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); - bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); - template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size = 1); + bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, + uint16_t rest, bool blocking = true); + bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, + uint16_t rest, bool blocking = true); + template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, + uint16_t rest, bool blocking, + size_t size = 1); void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet); - bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); - void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, amd::NDRangeContainer& sizes); + void dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet, uint16_t packetHeader, + hsa_signal_t signal); + void dispatchBarrierPacket(hsa_barrier_and_packet_t* packet, uint16_t packetHeader, + hsa_signal_t signal); + bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, + bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi); + void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, + amd::NDRangeContainer& sizes); bool initPool(size_t kernarg_pool_size, uint signal_pool_count); void destroyPool(); diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index 8b5d1d8bdb..fbbb9afe74 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -97,7 +97,7 @@ class Event : public RuntimeObject { static const EventWaitList nullWaitList; struct ProfilingInfo { - ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0) { + ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) { if (enabled) { clear(); callback_ = nullptr; @@ -111,6 +111,7 @@ class Event : public RuntimeObject { bool enabled_; //!< Profiling enabled for the wave limiter uint32_t waves_; //!< The number of waves used in a dispatch ProfilingCallback* callback_; + bool marker_ts_; void clear() { queued_ = 0ULL; submitted_ = 0ULL;