diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index e1fcd954e6..d4d07c4753 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -541,11 +541,14 @@ hipError_t GraphExec::Run(hipStream_t stream) { if (parallelLists_.size() == 1) { amd::AccumulateCommand* accumulate = nullptr; + bool isLastPacketKernel = false; if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { - accumulate = new amd::AccumulateCommand(*hip_stream); + uint8_t* lastCapturedPacket = (topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) ? + topoOrder_.back()->GetAqlPacket() : nullptr; + accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket); } - for (int i = 0; i < topoOrder_.size(); i++) { + for (int i = 0; i < topoOrder_.size() - 1; i++) { if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) { hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate); } else { @@ -555,7 +558,20 @@ hipError_t GraphExec::Run(hipStream_t stream) { } } - if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + // If last captured packet is kernel, optimize to detect completion of last kernel + // This saves on extra packet submitted to determine end of graph + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) { + accumulate->enqueue(); + accumulate->release(); + isLastPacketKernel = true; + } else { + topoOrder_.back()->SetStream(hip_stream, this); + status = topoOrder_.back()->CreateCommand(topoOrder_.back()->GetQueue()); + topoOrder_.back()->EnqueueCommands(stream); + } + + // If last packet is not kernel, submit a marker to detect end of graph + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && !isLastPacketKernel) { accumulate->enqueue(); accumulate->release(); } diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 36658f3884..eedae46f82 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -974,10 +974,8 @@ inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCom profilingBegin(*vcmd, true, true); } dispatchBlockingWait(); - auto packet = reinterpret_cast(aqlpacket); - constexpr size_t kPacketSize = 1; - Timestamp* ts = reinterpret_cast(vcmd->data()); + auto packet = reinterpret_cast(aqlpacket); dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); if (vcmd != nullptr) { profilingEnd(*vcmd, true); @@ -3435,12 +3433,22 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd, true, true); - const Settings& settings = dev().settings(); - if (settings.barrier_value_packet_) { - dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); + + uint8_t* aqlPacket = vcmd.getLastPacket(); + if (aqlPacket != nullptr) { + dispatchBlockingWait(); + constexpr size_t kPacketSize = 1; + auto packet = reinterpret_cast(aqlPacket); + dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); } else { - dispatchBarrierPacket(kNopPacketHeader, false); + const Settings& settings = dev().settings(); + if (settings.barrier_value_packet_) { + dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); + } else { + dispatchBarrierPacket(kNopPacketHeader, false); + } } + profilingEnd(vcmd, true); } diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index f5f6e92554..beb852ec9a 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -1261,14 +1261,19 @@ class Marker : public Command { }; class AccumulateCommand : public Command { + private: + uint8_t* lastPacket_; public: //! Create a new Marker AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList, - const Event* waitingEvent = nullptr) - : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) { + const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr) + : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent), + lastPacket_(lastPacket) + { profilingInfo_.multiple_ts_ = true; } - + // Return last packet + uint8_t* getLastPacket() const { return lastPacket_; } //! The command implementation virtual void submit(device::VirtualDevice& device) { device.submitAccumulate(*this);