SWDEV-422207 - Report kernel names for activity profiling
- Report kernel names for optimized graph path
- Refactor code so that we store profiling info in Accumulate command
Change-Id: Ib97735a0239aeb9fc3a50a4bb7126dd0bcadc8af
[ROCm/clr commit: b056686607]
Bu işleme şunda yer alıyor:
@@ -551,6 +551,7 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
for (int i = 0; i < topoOrder_.size() - 1; i++) {
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
|
||||
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
|
||||
accumulate->addKernelName(topoOrder_[i]->GetKernelName());
|
||||
} else {
|
||||
topoOrder_[i]->SetStream(hip_stream, this);
|
||||
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
@@ -561,6 +562,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
|
||||
// If last captured packet is kernel, optimize to detect completion of last kernel
|
||||
// This saves on extra packet submitted to determine end of graph
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
|
||||
// Add the last kernel node name to the accumulate command
|
||||
accumulate->addKernelName(topoOrder_.back()->GetKernelName());
|
||||
accumulate->enqueue();
|
||||
accumulate->release();
|
||||
isLastPacketKernel = true;
|
||||
|
||||
@@ -183,6 +183,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
|
||||
static amd::Monitor nodeSetLock_;
|
||||
unsigned int isEnabled_;
|
||||
uint8_t gpuPacket_[64]; //!< GPU Packet to enqueue during graph launch
|
||||
std::string capturedKernelName_;
|
||||
|
||||
public:
|
||||
GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
|
||||
@@ -232,7 +233,10 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
|
||||
}
|
||||
// Return gpu packet address to update with actual packet under capture.
|
||||
uint8_t* GetAqlPacket() { return gpuPacket_; }
|
||||
hip::Stream* GetQueue() { return stream_; }
|
||||
// Record the kernel name captured for this node in capturedKernelName_.
// The name is taken by const reference so the only copy made is the assignment into the member.
void SetKernelName(const std::string& kernelName) { capturedKernelName_ = kernelName; }
|
||||
// Return a reference to the kernel name stored in capturedKernelName_ (set via SetKernelName()).
const std::string& GetKernelName() const { return capturedKernelName_; }
|
||||
|
||||
hip::Stream* GetQueue() const { return stream_; }
|
||||
|
||||
virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
|
||||
stream_ = stream;
|
||||
@@ -805,8 +809,11 @@ class GraphKernelNode : public GraphNode {
|
||||
reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
|
||||
true, GetAqlPacket(), kernArgOffset);
|
||||
|
||||
// Enqueue command to capture GPU Packet. Packet is not sent to hardware queue.
|
||||
// Enqueue command to capture GPU Packet. The packet is not submitted to the device.
|
||||
// The packet is stored in gpuPacket_ and submitted during graph launch.
|
||||
command->submit(*(command->queue())->vdev());
|
||||
// NOTE: if non-kernel nodes are ever captured, verify that the command is an NDRangeKernelCommand before this cast
|
||||
SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
|
||||
command->release();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +135,7 @@ void Timestamp::checkGpuTime() {
|
||||
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
|
||||
// to ROCr is very slow and that also affects the overall performance of the callback thread
|
||||
if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_
|
||||
|| command().profilingInfo().multiple_ts_) {
|
||||
|| command().type() == CL_COMMAND_TASK) {
|
||||
hsa_amd_profiling_dispatch_time_t time = {};
|
||||
if (it->engine_ == HwQueueEngine::Compute) {
|
||||
hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
|
||||
@@ -149,8 +149,8 @@ void Timestamp::checkGpuTime() {
|
||||
start = std::min(time.start, start);
|
||||
end = std::max(time.end, end);
|
||||
|
||||
if (command().profilingInfo().multiple_ts_) {
|
||||
command().AddTimeStamps(time.start, time.end);
|
||||
if (command().type() == CL_COMMAND_TASK) {
|
||||
static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
|
||||
|
||||
@@ -101,10 +101,14 @@ void ReportActivity(const amd::Command& command) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (command.profilingInfo().tsList_.size() > 0) {
|
||||
for (auto& it : command.profilingInfo().tsList_) {
|
||||
if (command.type() == CL_COMMAND_TASK) {
|
||||
auto timestamps = static_cast<const amd::AccumulateCommand&>(command).getTimestamps();
|
||||
for (uint32_t i = 0; i < timestamps.size(); i++) {
|
||||
auto it = timestamps[i];
|
||||
record.begin_ns = it.first;
|
||||
record.end_ns = it.second;
|
||||
record.kernel_name =
|
||||
static_cast<const amd::AccumulateCommand&>(command).getKernelNames()[i].c_str();
|
||||
function(ACTIVITY_DOMAIN_HIP_OPS, operation_id, &record);
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -105,7 +105,7 @@ class Event : public RuntimeObject {
|
||||
|
||||
struct ProfilingInfo {
|
||||
ProfilingInfo(bool enabled = false)
|
||||
: enabled_(enabled), marker_ts_(false), multiple_ts_(false) {
|
||||
: enabled_(enabled), marker_ts_(false) {
|
||||
if (enabled) {
|
||||
clear();
|
||||
correlation_id_ = activity_prof::correlation_id;
|
||||
@@ -116,20 +116,17 @@ class Event : public RuntimeObject {
|
||||
uint64_t submitted_;
|
||||
uint64_t start_;
|
||||
uint64_t end_;
|
||||
std::vector<std::pair<uint64_t, uint64_t>> tsList_;
|
||||
|
||||
uint64_t correlation_id_;
|
||||
bool enabled_; //!< Profiling enabled for the wave limiter
|
||||
bool marker_ts_; //!< TS marker
|
||||
bool multiple_ts_; //!< Multiple TS
|
||||
|
||||
void clear() {
|
||||
queued_ = 0ULL;
|
||||
submitted_ = 0ULL;
|
||||
start_ = 0ULL;
|
||||
end_ = 0ULL;
|
||||
tsList_.clear();
|
||||
}
|
||||
}
|
||||
} profilingInfo_;
|
||||
|
||||
//! Construct a new event.
|
||||
@@ -225,11 +222,6 @@ class Event : public RuntimeObject {
|
||||
|
||||
//! Set release scope for the event
|
||||
void setEventScope(int32_t scope) { event_scope_ = scope; }
|
||||
|
||||
//! Add a timestamp to the list
|
||||
void AddTimeStamps(uint64_t start, uint64_t end) {
|
||||
profilingInfo_.tsList_.push_back(std::make_pair(start, end));
|
||||
}
|
||||
};
|
||||
|
||||
union CopyMetadata {
|
||||
@@ -1263,17 +1255,45 @@ class Marker : public Command {
|
||||
class AccumulateCommand : public Command {
|
||||
private:
|
||||
uint8_t* lastPacket_;
|
||||
|
||||
//! Kernel names and timestamps list for activity profiling
|
||||
std::vector<std::string> kernelNames_;
|
||||
std::vector<std::pair<uint64_t, uint64_t>> tsList_;
|
||||
|
||||
public:
|
||||
//! Create a new AccumulateCommand
|
||||
AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
|
||||
const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr)
|
||||
: Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent),
|
||||
lastPacket_(lastPacket)
|
||||
{
|
||||
profilingInfo_.multiple_ts_ = true;
|
||||
}
|
||||
// Return last packet
|
||||
{}
|
||||
//! Return last packet
|
||||
uint8_t* getLastPacket() const { return lastPacket_; }
|
||||
|
||||
//! Add kernel name to the list if available
|
||||
void addKernelName(const std::string& kernelName) {
|
||||
if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
|
||||
kernelNames_.push_back(kernelName);
|
||||
}
|
||||
}
|
||||
|
||||
//! Add kernel timestamp to the list if available
|
||||
void addTimestamps(uint64_t startTs, uint64_t endTs) {
|
||||
if (activity_prof::IsEnabled(OP_ID_DISPATCH)) {
|
||||
tsList_.push_back(std::make_pair(startTs, endTs));
|
||||
}
|
||||
}
|
||||
|
||||
//! Return the kernel names
|
||||
const std::vector<std::string>& getKernelNames() const {
|
||||
return kernelNames_;
|
||||
}
|
||||
|
||||
//! Return the kernel timestamps
|
||||
const std::vector<std::pair<uint64_t, uint64_t>>& getTimestamps() const {
|
||||
return tsList_;
|
||||
}
|
||||
|
||||
//! The command implementation
|
||||
virtual void submit(device::VirtualDevice& device) {
|
||||
device.submitAccumulate(*this);
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle