From 40f41f4d0bb778bce6edce634164c2614d979465 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Thu, 26 Oct 2023 20:06:18 +0000 Subject: [PATCH] SWDEV-422207 - Track commands for capture - Track all captured commands under a new AccumulateCommand - Add begin() and end() methods to capture commands - Explicit TS object now passed to certain methods because profilingBegin() and profilingEnd() now happen separately and thus can run into threading issues Change-Id: I171106bdcad72b057836cb2f3fc398db3533119f --- hipamd/src/hip_graph_internal.cpp | 16 +++--- rocclr/device/device.hpp | 6 ++- rocclr/device/pal/palvirtual.cpp | 8 ++- rocclr/device/pal/palvirtual.hpp | 4 +- rocclr/device/rocm/rocvirtual.cpp | 89 ++++++++++++++++++++----------- rocclr/device/rocm/rocvirtual.hpp | 10 ++-- rocclr/platform/command.cpp | 2 +- rocclr/platform/command.hpp | 31 ++++++++--- 8 files changed, 110 insertions(+), 56 deletions(-) diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index 884b4cdf6b..631e54ec80 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -529,9 +529,14 @@ hipError_t GraphExec::Run(hipStream_t stream) { } if (parallelLists_.size() == 1) { + amd::AccumulateCommand* accumulate = nullptr; + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + accumulate = new amd::AccumulateCommand(*hip_stream); + } + for (int i = 0; i < topoOrder_.size(); i++) { if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) { - hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket()); + hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate); } else { topoOrder_[i]->SetStream(hip_stream, this); status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue()); @@ -540,13 +545,8 @@ hipError_t GraphExec::Run(hipStream_t stream) { } if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { - amd::Command* endCommand = nullptr; - endCommand = new amd::Marker(*hip_stream, false); - // 
Since the end command is for graph completion tracking, - // it may not need release scopes - endCommand->setEventScope(amd::Device::kCacheStateIgnore); - endCommand->enqueue(); - endCommand->release(); + accumulate->enqueue(); + accumulate->release(); } } else { UpdateStream(parallelLists_, hip_stream, this); diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index c1459d24cf..c3662a9982 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -79,6 +79,7 @@ class PerfCounterCommand; class ReleaseObjectCommand; class StallQueueCommand; class Marker; +class AccumulateCommand; class ThreadTraceCommand; class ThreadTraceMemObjectsCommand; class SignalCommand; @@ -1248,6 +1249,7 @@ class VirtualDevice : public amd::HeapObject { virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0; virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0; virtual void submitMarker(amd::Marker& cmd) = 0; + virtual void submitAccumulate(amd::AccumulateCommand& cmd) = 0; virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) = 0; virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0; virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0; @@ -1287,7 +1289,9 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; - virtual bool dispatchAqlPacket(uint8_t* aqlpacket) = 0; + + //! Dispatch captured AQL packet + virtual bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) = 0; //! 
Resets fence state of the VirtualGPU virtual void resetFenceDirty() = 0; diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index 7a095f0117..96a199d9eb 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -2709,6 +2709,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) { Unimplemented(); //!< @todo: Unimplemented } +// ================================================================================================ void VirtualGPU::submitMarker(amd::Marker& vcmd) { //!@note runtime doesn't need to lock this command on execution @@ -2735,6 +2736,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { } } +// ================================================================================================ +void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { +} + +// ================================================================================================ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) { const Pal::IQueueSemaphore* sem = reinterpret_cast(cmd.sem_ptr()); @@ -2748,10 +2754,8 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) { queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast(sem), cmd.fence()); } - } - void VirtualGPU::releaseMemory(GpuMemoryReference* mem) { queues_[MainEngine]->removeCmdMemRef(mem); if (!dev().settings().disableSdma_) { diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp index dbed5809db..808e6c2616 100644 --- a/rocclr/device/pal/palvirtual.hpp +++ b/rocclr/device/pal/palvirtual.hpp @@ -318,6 +318,7 @@ class VirtualGPU : public device::VirtualDevice { void submitFillMemory(amd::FillMemoryCommand& vcmd); void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); void submitMarker(amd::Marker& vcmd); + void submitAccumulate(amd::AccumulateCommand& vcmd); void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd); void 
submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd); void submitPerfCounter(amd::PerfCounterCommand& vcmd); @@ -342,7 +343,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } - inline bool dispatchAqlPacket(uint8_t* aqlpacket) { return false; } + inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) { + return false; } void resetFenceDirty() {} diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index e8a75e9df4..16706003f8 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2013 - 2022 Advanced Micro Devices, Inc. +/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -127,7 +127,6 @@ void Timestamp::checkGpuTime() { for (auto it : signals_) { amd::ScopedLock lock(it->LockSignalOps()); - // Ignore the wait if runtime processes API callback, because the signal value is bigger // than expected and the value reset will occur after API callback is done if (GetCallbackSignal().handle == 0) { @@ -149,7 +148,8 @@ void Timestamp::checkGpuTime() { start = std::min(time.start, start); end = std::max(time.end, end); ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, " - "end = %ld time taken= %ld ns", it->signal_.handle, start, end, end - start); + "end = %ld time taken= %ld ns", it->signal_.handle, time.start, time.end, + time.end - time.start); } it->flags_.done_ = true; } @@ -848,6 +848,13 @@ bool VirtualGPU::dispatchGenericAqlPacket( if (timestamp_ != nullptr) { // Get active signal for current dispatch if profiling is necessary packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); + + // If profiling is enabled, store the correlation ID in the dispatch packet. 
The profiler can + // retrieve this correlation ID to attribute waves to specific dispatch locations. + if (std::is_same::value) { + auto dispatchPacket = reinterpret_cast(packet); + dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_; + } } // Make sure the slot is free for usage @@ -954,6 +961,24 @@ bool VirtualGPU::dispatchAqlPacket( return dispatchGenericAqlPacket(packet, header, rest, blocking); } +// ================================================================================================ +inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) { + amd::ScopedLock lock(execution()); + if (vcmd != nullptr) { + profilingBegin(*vcmd, true, true); + } + dispatchBlockingWait(); + auto packet = reinterpret_cast(aqlpacket); + + constexpr size_t kPacketSize = 1; + Timestamp* ts = (vcmd != nullptr) ? reinterpret_cast<Timestamp*>(vcmd->data()) : nullptr; + dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); + if (vcmd != nullptr) { + profilingEnd(*vcmd, true); + } + return true; +} + // ================================================================================================ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, @@ -1056,21 +1081,6 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, barrier_packet_.dep_signal[4] = hsa_signal_t{}; } -inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket) { - dispatchBlockingWait(); - auto packet = reinterpret_cast(aqlpacket); - // If rocprof tracing is enabled, store the correlation ID in the dispatch packet. - // The profiler can retrieve this correlation ID to attribute waves to specific dispatch - // locations. 
- if (activity_prof::IsEnabled(OP_ID_DISPATCH) || profiling_) { - packet->reserved2 = activity_prof::correlation_id; - // Get active signal for current dispatch if profiling is necessary - packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); - } - dispatchGenericAqlPacket(packet, packet->header, packet->setup, false); - return true; -} - // ================================================================================================ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveDepSignal, hsa_signal_t signal, hsa_signal_value_t value, @@ -1430,17 +1440,23 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) { * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data * and then calls start() to get the current host timestamp. */ -void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { +void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) { if (command.profilingInfo().enabled_) { if (timestamp_ != nullptr) { LogWarning("Trying to create a second timestamp in VirtualGPU. \ This could have unintended consequences."); return; } - // Without barrier profiling will wait for each individual signal - timestamp_ = new Timestamp(this, command); - command.setData(timestamp_); - timestamp_->start(); + Timestamp* ts = useCommandTs ? 
reinterpret_cast(command.data()) : timestamp_; + + if (ts == nullptr) { + // Without barrier profiling will wait for each individual signal + timestamp_ = new Timestamp(this, command); + command.setData(timestamp_); + timestamp_->start(); + } else { + timestamp_ = ts; + } // Enable SDMA profiling on the first access if profiling is set // Its not per command basis @@ -1473,10 +1489,11 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { * created for whatever command we are running and calls end() to get the * current host timestamp if no signal is available. */ -void VirtualGPU::profilingEnd(amd::Command& command) { +void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) { if (command.profilingInfo().enabled_) { - if (timestamp_->HwProfiling() == false) { - timestamp_->end(); + Timestamp* ts = useCommandTs ? reinterpret_cast(command.data()) : timestamp_; + if (ts->HwProfiling() == false) { + ts->end(); } timestamp_ = nullptr; } @@ -3238,11 +3255,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, addSystemScope_ = true; } - // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can - // retrieve this correlation ID to attribute waves to specific dispatch locations. 
- if (vcmd != nullptr && vcmd->profilingInfo().enabled_) { - dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_; - } // Copy scheduler's AQL packet for possible relaunch from the scheduler itself if (aql_packet != nullptr) { @@ -3374,7 +3386,6 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // ================================================================================================ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { - // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<vdev()->execution()); queue_->FormSubmissionBatch(this); - if (type() == CL_COMMAND_MARKER || type() == 0) { + if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) { // The current HSA signal tracking logic requires profiling enabled for the markers EnableProfiling(); // Update batch head for the current marker. Hence the status of all commands can be diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index 6861546951..ece4a21a0b 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -269,7 +269,8 @@ class Command : public Event { uint32_t commandWaitBits_; //! Construct a new command of the given OpenCL type. - Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList, + Command(HostQueue& queue, cl_command_type type, + const EventWaitList& eventWaitList = nullWaitList, uint32_t commandWaitBits = 0, const Event* waitingEvent = nullptr); //! Construct a new command of the given OpenCL type. 
@@ -857,7 +858,7 @@ class CopyMemoryCommand : public TwoMemoryArgsCommand { : TwoMemoryArgsCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory), srcOrigin_(srcOrigin), dstOrigin_(dstOrigin), - size_(size), + size_(size), copyMetadata_(copyMetadata){ // Sanity checks assert(size.c[0] > 0 && "invalid"); @@ -1222,7 +1223,8 @@ class ExternalSemaphoreCmd : public Command { public: ExternalSemaphoreCmd(HostQueue& queue, const void* sem_ptr, uint64_t fence, ExternalSemaphoreCmdType cmd_type) - : Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence), cmd_type_(cmd_type) {} + : Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence), + cmd_type_(cmd_type) {} virtual void submit(device::VirtualDevice& device) { device.submitExternalSemaphoreCmd(*this); @@ -1239,12 +1241,28 @@ class Marker : public Command { //! Create a new Marker Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList, const Event* waitingEvent = nullptr, bool cpu_wait = false) - : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) { cpu_wait_ = cpu_wait; } + : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) + { + cpu_wait_ = cpu_wait; + } //! The actual command implementation. virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); } }; +class AccumulateCommand : public Command { + public: + //! Create a new AccumulateCommand + AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList, + const Event* waitingEvent = nullptr) + : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {} + + //! The command implementation + virtual void submit(device::VirtualDevice& device) { + device.submitAccumulate(*this); + } +}; + /*! \brief Maps CL objects created from external ones and syncs the contents (blocking). 
* */ @@ -1516,8 +1534,9 @@ class SvmFreeMemoryCommand : public Command { void* userData_; //!< Data passed to user-defined callback public: - SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, uint32_t numSvmPointers, - void** svmPointers, freeCallBack pfnFreeFunc, void* userData) + SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, + uint32_t numSvmPointers, void** svmPointers, + freeCallBack pfnFreeFunc, void* userData) : Command(queue, CL_COMMAND_SVM_FREE, eventWaitList), //! We copy svmPointers since it can be reused/deallocated after // command creation