SWDEV-422207 - Track commands for capture

- Track all captured commands under a new AccumulateCommand
- Add begin() and end() methods to capture commands
- Pass an explicit Timestamp (TS) object to certain methods, because
profilingBegin() and profilingEnd() are now called separately and can
therefore run into threading issues

Change-Id: I171106bdcad72b057836cb2f3fc398db3533119f
Этот коммит содержится в:
Saleel Kudchadker
2023-10-26 20:06:18 +00:00
родитель 1338ff37e8
Коммит 40f41f4d0b
8 изменённых файлов: 110 добавлений и 56 удалений
+8 -8
Просмотреть файл
@@ -529,9 +529,14 @@ hipError_t GraphExec::Run(hipStream_t stream) {
}
if (parallelLists_.size() == 1) {
amd::AccumulateCommand* accumulate = nullptr;
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
accumulate = new amd::AccumulateCommand(*hip_stream);
}
for (int i = 0; i < topoOrder_.size(); i++) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket());
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
} else {
topoOrder_[i]->SetStream(hip_stream, this);
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -540,13 +545,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
}
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
amd::Command* endCommand = nullptr;
endCommand = new amd::Marker(*hip_stream, false);
// Since the end command is for graph completion tracking,
// it may not need release scopes
endCommand->setEventScope(amd::Device::kCacheStateIgnore);
endCommand->enqueue();
endCommand->release();
accumulate->enqueue();
accumulate->release();
}
} else {
UpdateStream(parallelLists_, hip_stream, this);
+5 -1
Просмотреть файл
@@ -79,6 +79,7 @@ class PerfCounterCommand;
class ReleaseObjectCommand;
class StallQueueCommand;
class Marker;
class AccumulateCommand;
class ThreadTraceCommand;
class ThreadTraceMemObjectsCommand;
class SignalCommand;
@@ -1248,6 +1249,7 @@ class VirtualDevice : public amd::HeapObject {
virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0;
virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0;
virtual void submitMarker(amd::Marker& cmd) = 0;
virtual void submitAccumulate(amd::AccumulateCommand& cmd) = 0;
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) = 0;
virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0;
virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0;
@@ -1287,7 +1289,9 @@ class VirtualDevice : public amd::HeapObject {
//! Returns fence state of the VirtualGPU
virtual bool isFenceDirty() const = 0;
virtual bool dispatchAqlPacket(uint8_t* aqlpacket) = 0;
//! Dispatch captured AQL packet
virtual bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) = 0;
//! Resets fence state of the VirtualGPU
virtual void resetFenceDirty() = 0;
+6 -2
Просмотреть файл
@@ -2709,6 +2709,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
Unimplemented(); //!< @todo: Unimplemented
}
// ================================================================================================
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
//!@note runtime doesn't need to lock this command on execution
@@ -2735,6 +2736,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
}
}
// ================================================================================================
//! Submits an accumulate command. The body is empty, so submitting an
//! amd::AccumulateCommand through this implementation does no work.
//! NOTE(review): presumably a placeholder until captured-packet tracking is
//! supported by this backend - confirm.
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
}
// ================================================================================================
void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
const Pal::IQueueSemaphore* sem = reinterpret_cast<const Pal::IQueueSemaphore*>(cmd.sem_ptr());
@@ -2748,10 +2754,8 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
cmd.fence());
}
}
void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
queues_[MainEngine]->removeCmdMemRef(mem);
if (!dev().settings().disableSdma_) {
+3 -1
Просмотреть файл
@@ -318,6 +318,7 @@ class VirtualGPU : public device::VirtualDevice {
void submitFillMemory(amd::FillMemoryCommand& vcmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
void submitMarker(amd::Marker& vcmd);
void submitAccumulate(amd::AccumulateCommand& vcmd);
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
@@ -342,7 +343,8 @@ class VirtualGPU : public device::VirtualDevice {
bool isFenceDirty() const { return false; }
inline bool dispatchAqlPacket(uint8_t* aqlpacket) { return false; }
inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) {
return false; }
void resetFenceDirty() {}
+57 -32
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2013 - 2022 Advanced Micro Devices, Inc.
/* Copyright (c) 2013 - 2023 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -127,7 +127,6 @@ void Timestamp::checkGpuTime() {
for (auto it : signals_) {
amd::ScopedLock lock(it->LockSignalOps());
// Ignore the wait if runtime processes API callback, because the signal value is bigger
// than expected and the value reset will occur after API callback is done
if (GetCallbackSignal().handle == 0) {
@@ -149,7 +148,8 @@ void Timestamp::checkGpuTime() {
start = std::min(time.start, start);
end = std::max(time.end, end);
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
"end = %ld time taken= %ld ns", it->signal_.handle, start, end, end - start);
"end = %ld time taken= %ld ns", it->signal_.handle, time.start, time.end,
time.end - time.start);
}
it->flags_.done_ = true;
}
@@ -848,6 +848,13 @@ bool VirtualGPU::dispatchGenericAqlPacket(
if (timestamp_ != nullptr) {
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
// If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
// retrieve this correlation ID to attribute waves to specific dispatch locations.
if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value) {
auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
}
}
// Make sure the slot is free for usage
@@ -954,6 +961,24 @@ bool VirtualGPU::dispatchAqlPacket(
return dispatchGenericAqlPacket(packet, header, rest, blocking);
}
// ================================================================================================
//! Dispatches a captured AQL kernel packet.
//!
//! \param aqlpacket  Raw packet bytes; reinterpreted as hsa_kernel_dispatch_packet_t.
//! \param vcmd       Optional accumulate command. When not nullptr, the dispatch is
//!                   bracketed by profilingBegin()/profilingEnd() using the
//!                   command's own timestamp (useCommandTs = true). When nullptr,
//!                   no profiling bracket is opened.
//! \return Always true; the result of dispatchGenericAqlPacket() is not propagated.
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  if (vcmd != nullptr) {
    profilingBegin(*vcmd, true, true);
  }
  dispatchBlockingWait();
  auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
  constexpr size_t kPacketSize = 1;
  // vcmd is optional (defaults to nullptr), so it must only be dereferenced
  // inside the null checks above and below.
  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
  if (vcmd != nullptr) {
    profilingEnd(*vcmd, true);
  }
  return true;
}
// ================================================================================================
bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
const uint32_t gfxVersion, bool blocking,
@@ -1056,21 +1081,6 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
barrier_packet_.dep_signal[4] = hsa_signal_t{};
}
//! Dispatches a captured AQL kernel packet that has no associated command.
//!
//! \param aqlpacket  Raw packet bytes; reinterpreted as hsa_kernel_dispatch_packet_t.
//! \return Always true.
inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket) {
  dispatchBlockingWait();

  auto* kernelPkt = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);

  // When rocprof dispatch tracing or profiling is active, tag the packet with the
  // correlation ID so the profiler can attribute waves to this dispatch location,
  // and attach the active signal for the current dispatch.
  const bool trackDispatch = activity_prof::IsEnabled(OP_ID_DISPATCH) || profiling_;
  if (trackDispatch) {
    kernelPkt->reserved2 = activity_prof::correlation_id;
    kernelPkt->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
  }

  dispatchGenericAqlPacket(kernelPkt, kernelPkt->header, kernelPkt->setup, false);
  return true;
}
// ================================================================================================
void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveDepSignal,
hsa_signal_t signal, hsa_signal_value_t value,
@@ -1430,17 +1440,23 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
* virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
* and then calls start() to get the current host timestamp.
*/
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) {
if (command.profilingInfo().enabled_) {
if (timestamp_ != nullptr) {
LogWarning("Trying to create a second timestamp in VirtualGPU. \
This could have unintended consequences.");
return;
}
// Without barrier profiling will wait for each individual signal
timestamp_ = new Timestamp(this, command);
command.setData(timestamp_);
timestamp_->start();
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
if (ts == nullptr) {
// Without barrier profiling will wait for each individual signal
timestamp_ = new Timestamp(this, command);
command.setData(timestamp_);
timestamp_->start();
} else {
timestamp_ = ts;
}
// Enable SDMA profiling on the first access if profiling is set
// Its not per command basis
@@ -1473,10 +1489,11 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
* created for whatever command we are running and calls end() to get the
* current host timestamp if no signal is available.
*/
void VirtualGPU::profilingEnd(amd::Command& command) {
void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) {
if (command.profilingInfo().enabled_) {
if (timestamp_->HwProfiling() == false) {
timestamp_->end();
Timestamp* ts = useCommandTs ? reinterpret_cast<Timestamp*>(command.data()) : timestamp_;
if (ts->HwProfiling() == false) {
ts->end();
}
timestamp_ = nullptr;
}
@@ -3238,11 +3255,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
addSystemScope_ = true;
}
// If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
// retrieve this correlation ID to attribute waves to specific dispatch locations.
if (vcmd != nullptr && vcmd->profilingInfo().enabled_) {
dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_;
}
// Copy scheduler's AQL packet for possible relaunch from the scheduler itself
if (aql_packet != nullptr) {
@@ -3374,7 +3386,6 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// ================================================================================================
//! Native function commands are not implemented here: the body is empty, so
//! submitting an amd::NativeFnCommand does nothing.
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
  // Not implemented.
}
// ================================================================================================
@@ -3413,6 +3424,20 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
}
}
// ================================================================================================
//! Submits an accumulate command: opens a profiling bracket that uses the
//! command's own timestamp, dispatches a single NOP barrier packet, and closes
//! the bracket again.
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
  // Hold the execution lock for the whole submission
  amd::ScopedLock guard(execution());

  profilingBegin(vcmd, true, true);

  // Pick the barrier flavour according to the device settings
  if (!dev().settings().barrier_value_packet_) {
    dispatchBarrierPacket(kNopPacketHeader, false);
  } else {
    dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
  }

  profilingEnd(vcmd, true);
}
// ================================================================================================
void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
+5 -5
Просмотреть файл
@@ -1,4 +1,4 @@
/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -299,8 +299,8 @@ class VirtualGPU : public device::VirtualDevice {
bool create();
const Device& dev() const { return roc_device_; }
void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
void profilingEnd(amd::Command& command);
void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false);
void profilingEnd(amd::Command& command, bool useCommandTs = false);
void updateCommandsState(amd::Command* list) const;
@@ -321,7 +321,7 @@ class VirtualGPU : public device::VirtualDevice {
);
void submitNativeFn(amd::NativeFnCommand& cmd);
void submitMarker(amd::Marker& cmd);
void submitAccumulate(amd::AccumulateCommand& cmd);
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
void submitPerfCounter(amd::PerfCounterCommand& cmd);
@@ -416,7 +416,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Dispatches a barrier with blocking HSA signals
void dispatchBlockingWait();
inline bool dispatchAqlPacket(uint8_t* aqlpacket);
inline bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr);
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
bool blocking = true, bool capturing = false,
const uint8_t* aqlPacket = nullptr);
+1 -1
Просмотреть файл
@@ -362,7 +362,7 @@ void Command::enqueue() {
ScopedLock sl(queue_->vdev()->execution());
queue_->FormSubmissionBatch(this);
if (type() == CL_COMMAND_MARKER || type() == 0) {
if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
// The current HSA signal tracking logic requires profiling enabled for the markers
EnableProfiling();
// Update batch head for the current marker. Hence the status of all commands can be
+25 -6
Просмотреть файл
@@ -269,7 +269,8 @@ class Command : public Event {
uint32_t commandWaitBits_;
//! Construct a new command of the given OpenCL type.
Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList,
Command(HostQueue& queue, cl_command_type type,
const EventWaitList& eventWaitList = nullWaitList,
uint32_t commandWaitBits = 0, const Event* waitingEvent = nullptr);
//! Construct a new command of the given OpenCL type.
@@ -857,7 +858,7 @@ class CopyMemoryCommand : public TwoMemoryArgsCommand {
: TwoMemoryArgsCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory),
srcOrigin_(srcOrigin),
dstOrigin_(dstOrigin),
size_(size),
size_(size),
copyMetadata_(copyMetadata){
// Sanity checks
assert(size.c[0] > 0 && "invalid");
@@ -1222,7 +1223,8 @@ class ExternalSemaphoreCmd : public Command {
public:
ExternalSemaphoreCmd(HostQueue& queue, const void* sem_ptr, uint64_t fence,
ExternalSemaphoreCmdType cmd_type)
: Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence), cmd_type_(cmd_type) {}
: Command::Command(queue, CL_COMMAND_USER), sem_ptr_(sem_ptr), fence_(fence),
cmd_type_(cmd_type) {}
virtual void submit(device::VirtualDevice& device) {
device.submitExternalSemaphoreCmd(*this);
@@ -1239,12 +1241,28 @@ class Marker : public Command {
//! Create a new Marker
Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList,
const Event* waitingEvent = nullptr, bool cpu_wait = false)
: Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) { cpu_wait_ = cpu_wait; }
: Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent)
{
cpu_wait_ = cpu_wait;
}
//! The actual command implementation.
virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); }
};
/*! \brief Command of type CL_COMMAND_TASK whose submission is forwarded to
 *  device::VirtualDevice::submitAccumulate().
 */
class AccumulateCommand : public Command {
 public:
  //! Create a new AccumulateCommand on the given queue
  AccumulateCommand(HostQueue& queue,
                    const EventWaitList& eventWaitList = nullWaitList,
                    const Event* waitingEvent = nullptr)
      : Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent) {}

  //! The actual command implementation.
  virtual void submit(device::VirtualDevice& device) { device.submitAccumulate(*this); }
};
/*! \brief Maps CL objects created from external ones and syncs the contents (blocking).
*
*/
@@ -1516,8 +1534,9 @@ class SvmFreeMemoryCommand : public Command {
void* userData_; //!< Data passed to user-defined callback
public:
SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, uint32_t numSvmPointers,
void** svmPointers, freeCallBack pfnFreeFunc, void* userData)
SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList,
uint32_t numSvmPointers, void** svmPointers,
freeCallBack pfnFreeFunc, void* userData)
: Command(queue, CL_COMMAND_SVM_FREE, eventWaitList),
//! We copy svmPointers since it can be reused/deallocated after
// command creation