SWDEV-422207 - Optimize graph end detection
- Do not use an extra barrier to detect the end of the graph. If the last node is a kernel node, we can use a completion signal for the last packet. Saves roughly 6 us per graph launch for the Phantom test case. Change-Id: I5e0c2479d9964fbeda86ed97533f6718f49a7f91
Tento commit je obsažen v:
@@ -541,11 +541,14 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
|
||||
if (parallelLists_.size() == 1) {
|
||||
amd::AccumulateCommand* accumulate = nullptr;
|
||||
bool isLastPacketKernel = false;
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
accumulate = new amd::AccumulateCommand(*hip_stream);
|
||||
uint8_t* lastCapturedPacket = (topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) ?
|
||||
topoOrder_.back()->GetAqlPacket() : nullptr;
|
||||
accumulate = new amd::AccumulateCommand(*hip_stream, {}, nullptr, lastCapturedPacket);
|
||||
}
|
||||
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
for (int i = 0; i < topoOrder_.size() - 1; i++) {
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
|
||||
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
|
||||
} else {
|
||||
@@ -555,7 +558,20 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
}
|
||||
}
|
||||
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
// If the last captured packet is a kernel, optimize by detecting completion of that last kernel
|
||||
// This saves the extra packet otherwise submitted to determine the end of the graph
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
|
||||
accumulate->enqueue();
|
||||
accumulate->release();
|
||||
isLastPacketKernel = true;
|
||||
} else {
|
||||
topoOrder_.back()->SetStream(hip_stream, this);
|
||||
status = topoOrder_.back()->CreateCommand(topoOrder_.back()->GetQueue());
|
||||
topoOrder_.back()->EnqueueCommands(stream);
|
||||
}
|
||||
|
||||
// If last packet is not kernel, submit a marker to detect end of graph
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && !isLastPacketKernel) {
|
||||
accumulate->enqueue();
|
||||
accumulate->release();
|
||||
}
|
||||
|
||||
@@ -974,10 +974,8 @@ inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCom
|
||||
profilingBegin(*vcmd, true, true);
|
||||
}
|
||||
dispatchBlockingWait();
|
||||
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
|
||||
|
||||
constexpr size_t kPacketSize = 1;
|
||||
Timestamp* ts = reinterpret_cast<Timestamp*>(vcmd->data());
|
||||
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
|
||||
dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
|
||||
if (vcmd != nullptr) {
|
||||
profilingEnd(*vcmd, true);
|
||||
@@ -3435,12 +3433,22 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
profilingBegin(vcmd, true, true);
|
||||
const Settings& settings = dev().settings();
|
||||
if (settings.barrier_value_packet_) {
|
||||
dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
|
||||
|
||||
uint8_t* aqlPacket = vcmd.getLastPacket();
|
||||
if (aqlPacket != nullptr) {
|
||||
dispatchBlockingWait();
|
||||
constexpr size_t kPacketSize = 1;
|
||||
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
|
||||
dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
|
||||
} else {
|
||||
dispatchBarrierPacket(kNopPacketHeader, false);
|
||||
const Settings& settings = dev().settings();
|
||||
if (settings.barrier_value_packet_) {
|
||||
dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
|
||||
} else {
|
||||
dispatchBarrierPacket(kNopPacketHeader, false);
|
||||
}
|
||||
}
|
||||
|
||||
profilingEnd(vcmd, true);
|
||||
}
|
||||
|
||||
|
||||
@@ -1261,14 +1261,19 @@ class Marker : public Command {
|
||||
};
|
||||
|
||||
class AccumulateCommand : public Command {
|
||||
private:
|
||||
uint8_t* lastPacket_;
|
||||
public:
|
||||
//! Create a new AccumulateCommand
|
||||
AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
|
||||
const Event* waitingEvent = nullptr)
|
||||
: Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {
|
||||
const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr)
|
||||
: Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent),
|
||||
lastPacket_(lastPacket)
|
||||
{
|
||||
profilingInfo_.multiple_ts_ = true;
|
||||
}
|
||||
|
||||
// Return last packet
|
||||
uint8_t* getLastPacket() const { return lastPacket_; }
|
||||
//! The command implementation
|
||||
virtual void submit(device::VirtualDevice& device) {
|
||||
device.submitAccumulate(*this);
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele