SWDEV-422207 - Track commands for capture
- Track all captured commands under a new AccumulateCommand.
- Add begin() and end() methods to capture commands.
- Pass an explicit Timestamp (TS) object to certain methods, because profilingBegin() and profilingEnd() now happen separately and could otherwise run into threading issues.

Change-Id: I171106bdcad72b057836cb2f3fc398db3533119f
Этот коммит содержится в:
@@ -529,9 +529,14 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
}
|
||||
|
||||
if (parallelLists_.size() == 1) {
|
||||
amd::AccumulateCommand* accumulate = nullptr;
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
accumulate = new amd::AccumulateCommand(*hip_stream);
|
||||
}
|
||||
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
|
||||
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket());
|
||||
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
|
||||
} else {
|
||||
topoOrder_[i]->SetStream(hip_stream, this);
|
||||
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
@@ -540,13 +545,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
}
|
||||
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
amd::Command* endCommand = nullptr;
|
||||
endCommand = new amd::Marker(*hip_stream, false);
|
||||
// Since the end command is for graph completion tracking,
|
||||
// it may not need release scopes
|
||||
endCommand->setEventScope(amd::Device::kCacheStateIgnore);
|
||||
endCommand->enqueue();
|
||||
endCommand->release();
|
||||
accumulate->enqueue();
|
||||
accumulate->release();
|
||||
}
|
||||
} else {
|
||||
UpdateStream(parallelLists_, hip_stream, this);
|
||||
|
||||
@@ -79,6 +79,7 @@ class PerfCounterCommand;
|
||||
class ReleaseObjectCommand;
|
||||
class StallQueueCommand;
|
||||
class Marker;
|
||||
class AccumulateCommand;
|
||||
class ThreadTraceCommand;
|
||||
class ThreadTraceMemObjectsCommand;
|
||||
class SignalCommand;
|
||||
@@ -1248,6 +1249,7 @@ class VirtualDevice : public amd::HeapObject {
|
||||
virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0;
|
||||
virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0;
|
||||
virtual void submitMarker(amd::Marker& cmd) = 0;
|
||||
virtual void submitAccumulate(amd::AccumulateCommand& cmd) = 0;
|
||||
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) = 0;
|
||||
virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0;
|
||||
virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0;
|
||||
@@ -1287,7 +1289,9 @@ class VirtualDevice : public amd::HeapObject {
|
||||
|
||||
//! Returns fence state of the VirtualGPU
|
||||
virtual bool isFenceDirty() const = 0;
|
||||
virtual bool dispatchAqlPacket(uint8_t* aqlpacket) = 0;
|
||||
|
||||
//! Dispatch captured AQL packet
|
||||
virtual bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) = 0;
|
||||
|
||||
//! Resets fence state of the VirtualGPU
|
||||
virtual void resetFenceDirty() = 0;
|
||||
|
||||
@@ -2709,6 +2709,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
|
||||
Unimplemented(); //!< @todo: Unimplemented
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
//!@note runtime doesn't need to lock this command on execution
|
||||
|
||||
@@ -2735,6 +2736,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! No-op: this implementation submits nothing for an accumulate command.
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
  // Nothing to do
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
|
||||
|
||||
const Pal::IQueueSemaphore* sem = reinterpret_cast<const Pal::IQueueSemaphore*>(cmd.sem_ptr());
|
||||
@@ -2748,10 +2754,8 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
|
||||
queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
|
||||
cmd.fence());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
|
||||
queues_[MainEngine]->removeCmdMemRef(mem);
|
||||
if (!dev().settings().disableSdma_) {
|
||||
|
||||
@@ -318,6 +318,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void submitFillMemory(amd::FillMemoryCommand& vcmd);
|
||||
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
|
||||
void submitMarker(amd::Marker& vcmd);
|
||||
void submitAccumulate(amd::AccumulateCommand& vcmd);
|
||||
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
|
||||
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
|
||||
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
|
||||
@@ -342,7 +343,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
bool isFenceDirty() const { return false; }
|
||||
|
||||
inline bool dispatchAqlPacket(uint8_t* aqlpacket) { return false; }
|
||||
//! Captured AQL packet dispatch stub: nothing is dispatched and the call always returns false.
inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) {
  return false;
}
|
||||
|
||||
void resetFenceDirty() {}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2013 - 2022 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -127,7 +127,6 @@ void Timestamp::checkGpuTime() {
|
||||
|
||||
for (auto it : signals_) {
|
||||
amd::ScopedLock lock(it->LockSignalOps());
|
||||
|
||||
// Ignore the wait if runtime processes API callback, because the signal value is bigger
|
||||
// than expected and the value reset will occur after API callback is done
|
||||
if (GetCallbackSignal().handle == 0) {
|
||||
@@ -149,7 +148,8 @@ void Timestamp::checkGpuTime() {
|
||||
start = std::min(time.start, start);
|
||||
end = std::max(time.end, end);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
|
||||
"end = %ld time taken= %ld ns", it->signal_.handle, start, end, end - start);
|
||||
"end = %ld time taken= %ld ns", it->signal_.handle, time.start, time.end,
|
||||
time.end - time.start);
|
||||
}
|
||||
it->flags_.done_ = true;
|
||||
}
|
||||
@@ -848,6 +848,13 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
if (timestamp_ != nullptr) {
|
||||
// Get active signal for current dispatch if profiling is necessary
|
||||
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
|
||||
|
||||
// If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
|
||||
// retrieve this correlation ID to attribute waves to specific dispatch locations.
|
||||
if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value) {
|
||||
auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
|
||||
dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure the slot is free for usage
|
||||
@@ -954,6 +961,24 @@ bool VirtualGPU::dispatchAqlPacket(
|
||||
return dispatchGenericAqlPacket(packet, header, rest, blocking);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Dispatches a captured AQL kernel packet, optionally tracking it under an accumulate command.
//!
//! \param aqlpacket  Captured packet bytes, reinterpreted as hsa_kernel_dispatch_packet_t
//! \param vcmd       Accumulate command passed to profilingBegin()/profilingEnd(); may be
//!                   nullptr (the declared default), in which case both calls are skipped
//! \return Always true
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // vcmd is optional, so it must only be dereferenced under a null check
  if (vcmd != nullptr) {
    profilingBegin(*vcmd, true, true);
  }

  dispatchBlockingWait();
  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);

  constexpr size_t kPacketSize = 1;
  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);

  if (vcmd != nullptr) {
    profilingEnd(*vcmd, true);
  }
  return true;
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
|
||||
const uint32_t gfxVersion, bool blocking,
|
||||
@@ -1056,21 +1081,6 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
barrier_packet_.dep_signal[4] = hsa_signal_t{};
|
||||
}
|
||||
|
||||
//! Dispatches a captured AQL kernel packet that has no associated command.
//!
//! \param aqlpacket  Captured packet bytes, reinterpreted as hsa_kernel_dispatch_packet_t
//! \return Always true
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket) {
  dispatchBlockingWait();
  auto* dispatch = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);

  // When rocprof tracing or profiling is active, stamp the packet with the correlation ID so
  // the profiler can attribute waves to specific dispatch locations, and attach the active
  // signal for the current dispatch.
  const bool tracked = activity_prof::IsEnabled(OP_ID_DISPATCH) || profiling_;
  if (tracked) {
    dispatch->reserved2 = activity_prof::correlation_id;
    dispatch->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
  }

  dispatchGenericAqlPacket(dispatch, dispatch->header, dispatch->setup, false);
  return true;
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveDepSignal,
|
||||
hsa_signal_t signal, hsa_signal_value_t value,
|
||||
@@ -1430,17 +1440,23 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
|
||||
* virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
|
||||
* and then calls start() to get the current host timestamp.
|
||||
*/
|
||||
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
|
||||
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
|
||||
if (command.profilingInfo().enabled_) {
|
||||
if (timestamp_ != nullptr) {
|
||||
LogWarning("Trying to create a second timestamp in VirtualGPU. \
|
||||
This could have unintended consequences.");
|
||||
return;
|
||||
}
|
||||
// Without barrier profiling will wait for each individual signal
|
||||
timestamp_ = new Timestamp(this, command);
|
||||
command.setData(timestamp_);
|
||||
timestamp_->start();
|
||||
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
|
||||
|
||||
if (ts == nullptr) {
|
||||
// Without barrier profiling will wait for each individual signal
|
||||
timestamp_ = new Timestamp(this, command);
|
||||
command.setData(timestamp_);
|
||||
timestamp_->start();
|
||||
} else {
|
||||
timestamp_ = ts;
|
||||
}
|
||||
|
||||
// Enable SDMA profiling on the first access if profiling is set
|
||||
// Its not per command basis
|
||||
@@ -1473,10 +1489,11 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
|
||||
* created for whatever command we are running and calls end() to get the
|
||||
* current host timestamp if no signal is available.
|
||||
*/
|
||||
void VirtualGPU::profilingEnd(amd::Command& command) {
|
||||
void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
|
||||
if (command.profilingInfo().enabled_) {
|
||||
if (timestamp_->HwProfiling() == false) {
|
||||
timestamp_->end();
|
||||
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
|
||||
if (ts->HwProfiling() == false) {
|
||||
ts->end();
|
||||
}
|
||||
timestamp_ = nullptr;
|
||||
}
|
||||
@@ -3238,11 +3255,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
addSystemScope_ = true;
|
||||
}
|
||||
|
||||
// If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
|
||||
// retrieve this correlation ID to attribute waves to specific dispatch locations.
|
||||
if (vcmd != nullptr && vcmd->profilingInfo().enabled_) {
|
||||
dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_;
|
||||
}
|
||||
|
||||
// Copy scheduler's AQL packet for possible relaunch from the scheduler itself
|
||||
if (aql_packet != nullptr) {
|
||||
@@ -3374,7 +3386,6 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
|
||||
// ================================================================================================
|
||||
//! No-op: native function commands are not implemented here, so nothing is submitted.
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
  // Not implemented
}
|
||||
|
||||
// ================================================================================================
|
||||
@@ -3413,6 +3424,20 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Submits an accumulate command: a single NOP barrier packet is dispatched between
//! profilingBegin() and profilingEnd(), both invoked with useCommandTs set.
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
  // Hold the execution lock for the whole submission
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true, true);

  // The device settings decide which barrier packet flavour is dispatched
  if (!dev().settings().barrier_value_packet_) {
    dispatchBarrierPacket(kNopPacketHeader, false);
  } else {
    dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
  }

  profilingEnd(vcmd, true);
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
|
||||
/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -299,8 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
bool create();
|
||||
const Device& dev() const { return roc_device_; }
|
||||
|
||||
void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
|
||||
void profilingEnd(amd::Command& command);
|
||||
void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
|
||||
void profilingEnd(amd::Command& command, bool useCommandTs = false);
|
||||
|
||||
void updateCommandsState(amd::Command* list) const;
|
||||
|
||||
@@ -321,7 +321,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
);
|
||||
void submitNativeFn(amd::NativeFnCommand& cmd);
|
||||
void submitMarker(amd::Marker& cmd);
|
||||
|
||||
void submitAccumulate(amd::AccumulateCommand& cmd);
|
||||
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
|
||||
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
|
||||
void submitPerfCounter(amd::PerfCounterCommand& cmd);
|
||||
@@ -416,7 +416,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Dispatches a barrier with blocking HSA signals
|
||||
void dispatchBlockingWait();
|
||||
|
||||
inline bool dispatchAqlPacket(uint8_t* aqlpacket);
|
||||
inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr);
|
||||
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
|
||||
bool blocking = true, bool capturing = false,
|
||||
const uint8_t* aqlPacket = nullptr);
|
||||
|
||||
@@ -362,7 +362,7 @@ void Command::enqueue() {
|
||||
ScopedLock sl(queue_->vdev()->execution());
|
||||
queue_->FormSubmissionBatch(this);
|
||||
|
||||
if (type() == CL_COMMAND_MARKER || type() == 0) {
|
||||
if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
|
||||
// The current HSA signal tracking logic requires profiling enabled for the markers
|
||||
EnableProfiling();
|
||||
// Update batch head for the current marker. Hence the status of all commands can be
|
||||
|
||||
@@ -269,7 +269,8 @@ class Command : public Event {
|
||||
uint32_t commandWaitBits_;
|
||||
|
||||
//! Construct a new command of the given OpenCL type.
|
||||
Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList,
|
||||
Command(HostQueue& queue, cl_command_type type,
|
||||
const EventWaitList& eventWaitList = nullWaitList,
|
||||
uint32_t commandWaitBits = 0, const Event* waitingEvent = nullptr);
|
||||
|
||||
//! Construct a new command of the given OpenCL type.
|
||||
@@ -857,7 +858,7 @@ class CopyMemoryCommand : public TwoMemoryArgsCommand {
|
||||
: TwoMemoryArgsCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory),
|
||||
srcOrigin_(srcOrigin),
|
||||
dstOrigin_(dstOrigin),
|
||||
size_(size),
|
||||
size_(size),
|
||||
copyMetadata_(copyMetadata){
|
||||
// Sanity checks
|
||||
assert(size.c[0] > 0 && "invalid");
|
||||
@@ -1222,7 +1223,8 @@ class ExternalSemaphoreCmd : public Command {
|
||||
public:
|
||||
ExternalSemaphoreCmd(HostQueue& queue, const void* sem_ptr, uint64_t fence,
|
||||
ExternalSemaphoreCmdType cmd_type)
|
||||
: Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence), cmd_type_(cmd_type) {}
|
||||
: Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence),
|
||||
cmd_type_(cmd_type) {}
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) {
|
||||
device.submitExternalSemaphoreCmd(*this);
|
||||
@@ -1239,12 +1241,28 @@ class Marker : public Command {
|
||||
//! Create a new Marker
|
||||
Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList,
|
||||
const Event* waitingEvent = nullptr, bool cpu_wait = false)
|
||||
: Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) { cpu_wait_ = cpu_wait; }
|
||||
: Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent)
|
||||
{
|
||||
cpu_wait_ = cpu_wait;
|
||||
}
|
||||
|
||||
//! The actual command implementation.
|
||||
virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); }
|
||||
};
|
||||
|
||||
class AccumulateCommand : public Command {
|
||||
public:
|
||||
//! Create a new Marker
|
||||
AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
|
||||
const Event* waitingEvent = nullptr)
|
||||
: Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {}
|
||||
|
||||
//! The command implementation
|
||||
virtual void submit(device::VirtualDevice& device) {
|
||||
device.submitAccumulate(*this);
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Maps CL objects created from external ones and syncs the contents (blocking).
|
||||
*
|
||||
*/
|
||||
@@ -1516,8 +1534,9 @@ class SvmFreeMemoryCommand : public Command {
|
||||
void* userData_; //!< Data passed to user-defined callback
|
||||
|
||||
public:
|
||||
SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, uint32_t numSvmPointers,
|
||||
void** svmPointers, freeCallBack pfnFreeFunc, void* userData)
|
||||
SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList,
|
||||
uint32_t numSvmPointers, void** svmPointers,
|
||||
freeCallBack pfnFreeFunc, void* userData)
|
||||
: Command(queue, CL_COMMAND_SVM_FREE, eventWaitList),
|
||||
//! We copy svmPointers since it can be reused/deallocated after
|
||||
// command creation
|
||||
|
||||
Ссылка в новой задаче
Block a user