From cb9a715e04a404c0f1a42e8da52a148f5effabb0 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 15 Nov 2023 04:27:56 +0000 Subject: [PATCH] SWDEV-422207 - Report kernel names for activity profiling - Report kernel names for optimized graph path - Refactor code so that we store profiling info in Accumulate command Change-Id: Ib97735a0239aeb9fc3a50a4bb7126dd0bcadc8af [ROCm/clr commit: b05668660777d6f530ff928b80153292ebff1075] --- .../clr/hipamd/src/hip_graph_internal.cpp | 3 ++ .../clr/hipamd/src/hip_graph_internal.hpp | 11 ++++- .../clr/rocclr/device/rocm/rocvirtual.cpp | 6 +-- projects/clr/rocclr/platform/activity.cpp | 8 +++- projects/clr/rocclr/platform/command.hpp | 48 +++++++++++++------ 5 files changed, 55 insertions(+), 21 deletions(-) diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index d4d07c4753..6886bc03c6 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -551,6 +551,7 @@ hipError_t GraphExec::Run(hipStream_t stream) { for (int i = 0; i < topoOrder_.size() - 1; i++) { if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) { hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate); + accumulate->addKernelName(topoOrder_[i]->GetKernelName()); } else { topoOrder_[i]->SetStream(hip_stream, this); status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue()); @@ -561,6 +562,8 @@ hipError_t GraphExec::Run(hipStream_t stream) { // If last captured packet is kernel, optimize to detect completion of last kernel // This saves on extra packet submitted to determine end of graph if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) { + // Add the last kernel node name to the accumulate command + accumulate->addKernelName(topoOrder_.back()->GetKernelName()); accumulate->enqueue(); accumulate->release(); isLastPacketKernel = true; diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 27892301c4..61bab5bb49 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -183,6 +183,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute { static amd::Monitor nodeSetLock_; unsigned int isEnabled_; uint8_t gpuPacket_[64]; //!< GPU Packet to enqueue during graph launch + std::string capturedKernelName_; public: GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "", @@ -232,7 +233,10 @@ struct GraphNode : public hipGraphNodeDOTAttribute { } // Return gpu packet address to update with actual packet under capture. uint8_t* GetAqlPacket() { return gpuPacket_; } - hip::Stream* GetQueue() { return stream_; } + void SetKernelName(std::string kernelName) { capturedKernelName_ = kernelName; } + const std::string& GetKernelName() const { return capturedKernelName_; } + + hip::Stream* GetQueue() const { return stream_; } virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) { stream_ = stream; @@ -805,8 +809,11 @@ class GraphKernelNode : public GraphNode { reinterpret_cast(command)->setCapturingState( true, GetAqlPacket(), kernArgOffset); - // Enqueue command to capture GPU Packet. Packet is not sent to hardware queue. + // Enqueue command to capture GPU Packet. The packet is not submitted to the device. + // The packet is stored in gpuPacket_ and submitted during graph launch. command->submit(*(command->queue())->vdev()); + // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes + SetKernelName(reinterpret_cast(command)->kernel().name()); command->release(); } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 652e195838..09674b96fd 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -135,7 +135,7 @@ void Timestamp::checkGpuTime() { // Avoid profiling data for the sync barrier, in tiny performance tests the first call // to ROCr is very slow and that also affects the overall performance of the callback thread if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ - || command().profilingInfo().multiple_ts_) { + || command().type() == CL_COMMAND_TASK) { hsa_amd_profiling_dispatch_time_t time = {}; if (it->engine_ == HwQueueEngine::Compute) { hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time); @@ -149,8 +149,8 @@ void Timestamp::checkGpuTime() { start = std::min(time.start, start); end = std::max(time.end, end); - if (command().profilingInfo().multiple_ts_) { - command().AddTimeStamps(time.start, time.end); + if (command().type() == CL_COMMAND_TASK) { + static_cast(command()).addTimestamps(time.start, time.end); } ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, " diff --git a/projects/clr/rocclr/platform/activity.cpp b/projects/clr/rocclr/platform/activity.cpp index bbe142202d..e90c5a7b08 100644 --- a/projects/clr/rocclr/platform/activity.cpp +++ b/projects/clr/rocclr/platform/activity.cpp @@ -101,10 +101,14 @@ void ReportActivity(const amd::Command& command) { break; } - if (command.profilingInfo().tsList_.size() > 0) { - for (auto& it : command.profilingInfo().tsList_) { + if (command.type() == CL_COMMAND_TASK) { + auto timestamps = static_cast(command).getTimestamps(); + for (uint32_t i = 0; i < timestamps.size(); i++) { + auto it = timestamps[i]; record.begin_ns = it.first; record.end_ns = it.second; + record.kernel_name = + static_cast(command).getKernelNames()[i].c_str(); function(ACTIVITY_DOMAIN_HIP_OPS, operation_id, &record); } } else { diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index beb852ec9a..3fda3eee69 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -105,7 +105,7 @@ class Event : public RuntimeObject { struct ProfilingInfo { ProfilingInfo(bool enabled = false) - : enabled_(enabled), marker_ts_(false), multiple_ts_(false) { + : enabled_(enabled), marker_ts_(false) { if (enabled) { clear(); correlation_id_ = activity_prof::correlation_id; @@ -116,20 +116,17 @@ class Event : public RuntimeObject { uint64_t submitted_; uint64_t start_; uint64_t end_; - std::vector> tsList_; uint64_t correlation_id_; bool enabled_; //!< Profiling enabled for the wave limiter bool marker_ts_; //!< TS marker - bool multiple_ts_; //!< Multiple TS void clear() { queued_ = 0ULL; submitted_ = 0ULL; start_ = 0ULL; end_ = 0ULL; - tsList_.clear(); - } + } } profilingInfo_; //! Construct a new event. @@ -225,11 +222,6 @@ class Event : public RuntimeObject { //! Set release scope for the event void setEventScope(int32_t scope) { event_scope_ = scope; } - - //! Add a timestamp to the list - void AddTimeStamps(uint64_t start, uint64_t end) { - profilingInfo_.tsList_.push_back(std::make_pair(start, end)); - } }; union CopyMetadata { @@ -1263,17 +1255,45 @@ class Marker : public Command { class AccumulateCommand : public Command { private: uint8_t* lastPacket_; + + //! Kernel names and timestamps list for activity profiling + std::vector kernelNames_; + std::vector> tsList_; + public: //! Create a new Marker AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList, const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr) : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent), lastPacket_(lastPacket) - { - profilingInfo_.multiple_ts_ = true; - } - // Return last packet + {} + //! Return last packet uint8_t* getLastPacket() const { return lastPacket_; } + + //! Add kernel name to the list if available + void addKernelName(const std::string& kernelName) { + if (activity_prof::IsEnabled(OP_ID_DISPATCH)) { + kernelNames_.push_back(kernelName); + } + } + + //! Add kernel timestamp to the list if available + void addTimestamps(uint64_t startTs, uint64_t endTs) { + if (activity_prof::IsEnabled(OP_ID_DISPATCH)) { + tsList_.push_back(std::make_pair(startTs, endTs)); + } + } + + //! Return the kernel names + const std::vector& getKernelNames() const { + return kernelNames_; + } + + //! Return the kernel timestamps + const std::vector>& getTimestamps() const { + return tsList_; + } + //! The command implementation virtual void submit(device::VirtualDevice& device) { device.submitAccumulate(*this);