SWDEV-422207 - Optimize graph end detection

- Do not use an extra barrier to detect graph end. If the last node is a
kernel node, we can use a completion signal for the last packet instead.
Saves roughly 6us per graph launch for the Phantom testcase.

Change-Id: I5e0c2479d9964fbeda86ed97533f6718f49a7f91
Tento commit je obsažen v:
Saleel Kudchadker
2023-11-09 23:52:40 +00:00
rodič f06368fd04
revize c3bd229f4f
3 změnil soubory, kde provedl 42 přidání a 13 odebrání
+19 -3
Zobrazit soubor
@@ -541,11 +541,14 @@ hipError_t GraphExec::Run(hipStream_t stream) {
if (parallelLists_.size() == 1) {
amd::AccumulateCommand* accumulate = nullptr;
bool isLastPacketKernel = false;
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
accumulate = new amd::AccumulateCommand(*hip_stream);
uint8_t* lastCapturedPacket = (topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) ?
topoOrder_.back()->GetAqlPacket() : nullptr;
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket);
}
for (int i = 0; i < topoOrder_.size(); i++) {
for (int i = 0; i < topoOrder_.size() - 1; i++) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
} else {
@@ -555,7 +558,20 @@ hipError_t GraphExec::Run(hipStream_t stream) {
}
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
// If last captured packet is kernel, optimize to detect completion of last kernel
// This saves on extra packet submitted to determine end of graph
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
accumulate->enqueue();
accumulate->release();
isLastPacketKernel = true;
} else {
topoOrder_.back()->SetStream(hip_stream, this);
status = topoOrder_.back()->CreateCommand(topoOrder_.back()->GetQueue());
topoOrder_.back()->EnqueueCommands(stream);
}
// If last packet is not kernel, submit a marker to detect end of graph
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && !isLastPacketKernel) {
accumulate->enqueue();
accumulate->release();
}
+15 -7
Zobrazit soubor
@@ -974,10 +974,8 @@ inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCom
profilingBegin(*vcmd, true, true);
}
dispatchBlockingWait();
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
constexpr size_t kPacketSize = 1;
Timestamp* ts = reinterpret_cast<Timestamp*>(vcmd->data());
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
if (vcmd != nullptr) {
profilingEnd(*vcmd, true);
@@ -3435,12 +3433,22 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true, true);
const Settings& settings = dev().settings();
if (settings.barrier_value_packet_) {
dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
uint8_t* aqlPacket = vcmd.getLastPacket();
if (aqlPacket != nullptr) {
dispatchBlockingWait();
constexpr size_t kPacketSize = 1;
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
} else {
dispatchBarrierPacket(kNopPacketHeader, false);
const Settings& settings = dev().settings();
if (settings.barrier_value_packet_) {
dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
} else {
dispatchBarrierPacket(kNopPacketHeader, false);
}
}
profilingEnd(vcmd, true);
}
+8 -3
Zobrazit soubor
@@ -1261,14 +1261,19 @@ class Marker : public Command {
};
class AccumulateCommand : public Command {
private:
uint8_t* lastPacket_;
public:
//! Create a new AccumulateCommand
AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
const Event* waitingEvent = nullptr)
: Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {
const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr)
: Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent),
lastPacket_(lastPacket)
{
profilingInfo_.multiple_ts_ = true;
}
// Return last packet
uint8_t* getLastPacket() const { return lastPacket_; }
//! The command implementation
virtual void submit(device::VirtualDevice& device) {
device.submitAccumulate(*this);