SWDEV-422207 - Report kernel names for activity profiling

- Report kernel names for optimized graph path
- Refactor code so that we store profiling info in Accumulate command

Change-Id: Ib97735a0239aeb9fc3a50a4bb7126dd0bcadc8af


[ROCm/clr commit: b056686607]
Bu işleme şunda yer alıyor:
Saleel Kudchadker
2023-11-15 04:27:56 +00:00
ebeveyn 153bb15f46
işleme cb9a715e04
5 değiştirilmiş dosya ile 55 ekleme ve 21 silme
+3
Dosyayı Görüntüle
@@ -551,6 +551,7 @@ hipError_t GraphExec::Run(hipStream_t stream) {
for (int i = 0; i < topoOrder_.size() - 1; i++) {
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_[i]->GetType() == hipGraphNodeTypeKernel) {
hip_stream->vdev()->dispatchAqlPacket(topoOrder_[i]->GetAqlPacket(), accumulate);
accumulate->addKernelName(topoOrder_[i]->GetKernelName());
} else {
topoOrder_[i]->SetStream(hip_stream, this);
status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
@@ -561,6 +562,8 @@ hipError_t GraphExec::Run(hipStream_t stream) {
// If last captured packet is kernel, optimize to detect completion of last kernel
// This saves on extra packet submitted to determine end of graph
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE && topoOrder_.back()->GetType() == hipGraphNodeTypeKernel) {
// Add the last kernel node name to the accumulate command
accumulate->addKernelName(topoOrder_.back()->GetKernelName());
accumulate->enqueue();
accumulate->release();
isLastPacketKernel = true;
+9 -2
Dosyayı Görüntüle
@@ -183,6 +183,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
static amd::Monitor nodeSetLock_;
unsigned int isEnabled_;
uint8_t gpuPacket_[64]; //!< GPU Packet to enqueue during graph launch
std::string capturedKernelName_;
public:
GraphNode(hipGraphNodeType type, std::string style = "", std::string shape = "",
@@ -232,7 +233,10 @@ struct GraphNode : public hipGraphNodeDOTAttribute {
}
// Hand out the address of gpuPacket_ so that it can be overwritten with the
// actual packet while the node is under capture.
uint8_t* GetAqlPacket() {
  return &gpuPacket_[0];
}
hip::Stream* GetQueue() { return stream_; }
// Store the name of the kernel captured for this node.
// kernelName is received by value, so it is already a private copy; swapping
// hands its buffer to capturedKernelName_ instead of copying the characters a
// second time. The previous name is destroyed together with the parameter.
void SetKernelName(std::string kernelName) { capturedKernelName_.swap(kernelName); }
// Read-only access to the kernel name stored by SetKernelName().
const std::string& GetKernelName() const {
  return capturedKernelName_;
}
hip::Stream* GetQueue() const { return stream_; }
virtual void SetStream(hip::Stream* stream, GraphExec* ptr = nullptr) {
stream_ = stream;
@@ -805,8 +809,11 @@ class GraphKernelNode : public GraphNode {
reinterpret_cast<amd::NDRangeKernelCommand*>(command)->setCapturingState(
true, GetAqlPacket(), kernArgOffset);
// Enqueue command to capture GPU Packet. Packet is not sent to hardware queue.
// Enqueue command to capture GPU Packet. The packet is not submitted to the device.
// The packet is stored in gpuPacket_ and submitted during graph launch.
command->submit(*(command->queue())->vdev());
// NOTE: the cast below assumes the command is an NDRangeKernelCommand; add a type check here if non-kernel nodes are ever captured
SetKernelName(reinterpret_cast<amd::NDRangeKernelCommand*>(command)->kernel().name());
command->release();
}
}
+3 -3
Dosyayı Görüntüle
@@ -135,7 +135,7 @@ void Timestamp::checkGpuTime() {
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
// to ROCr is very slow and that also affects the overall performance of the callback thread
if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_
|| command().profilingInfo().multiple_ts_) {
|| command().type() == CL_COMMAND_TASK) {
hsa_amd_profiling_dispatch_time_t time = {};
if (it->engine_ == HwQueueEngine::Compute) {
hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
@@ -149,8 +149,8 @@ void Timestamp::checkGpuTime() {
start = std::min(time.start, start);
end = std::max(time.end, end);
if (command().profilingInfo().multiple_ts_) {
command().AddTimeStamps(time.start, time.end);
if (command().type() == CL_COMMAND_TASK) {
static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
}
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
+6 -2
Dosyayı Görüntüle
@@ -101,10 +101,14 @@ void ReportActivity(const amd::Command& command) {
break;
}
if (command.profilingInfo().tsList_.size() > 0) {
for (auto& it : command.profilingInfo().tsList_) {
if (command.type() == CL_COMMAND_TASK) {
auto timestamps = static_cast<const amd::AccumulateCommand&>(command).getTimestamps();
for (uint32_t i = 0; i < timestamps.size(); i++) {
auto it = timestamps[i];
record.begin_ns = it.first;
record.end_ns = it.second;
record.kernel_name =
static_cast<const amd::AccumulateCommand&>(command).getKernelNames()[i].c_str();
function(ACTIVITY_DOMAIN_HIP_OPS, operation_id, &record);
}
} else {
+34 -14
Dosyayı Görüntüle
@@ -105,7 +105,7 @@ class Event : public RuntimeObject {
struct ProfilingInfo {
ProfilingInfo(bool enabled = false)
: enabled_(enabled), marker_ts_(false), multiple_ts_(false) {
: enabled_(enabled), marker_ts_(false) {
if (enabled) {
clear();
correlation_id_ = activity_prof::correlation_id;
@@ -116,20 +116,17 @@ class Event : public RuntimeObject {
uint64_t submitted_;
uint64_t start_;
uint64_t end_;
std::vector<std::pair<uint64_t, uint64_t>> tsList_;
uint64_t correlation_id_;
bool enabled_; //!< Profiling enabled for the wave limiter
bool marker_ts_; //!< TS marker
bool multiple_ts_; //!< Multiple TS
//! Reset all recorded time points to zero and drop any stored timestamp pairs
void clear() {
  queued_ = submitted_ = start_ = end_ = 0ULL;
  tsList_.clear();
}
}
} profilingInfo_;
//! Construct a new event.
@@ -225,11 +222,6 @@ class Event : public RuntimeObject {
//! Store the release scope to use for this event
void setEventScope(int32_t scope) {
  event_scope_ = scope;
}
//! Append one (start, end) timestamp pair to the profiling info's list
void AddTimeStamps(uint64_t start, uint64_t end) {
  profilingInfo_.tsList_.emplace_back(start, end);
}
};
union CopyMetadata {
@@ -1263,17 +1255,45 @@ class Marker : public Command {
class AccumulateCommand : public Command {
private:
uint8_t* lastPacket_;
//! Kernel names and timestamps list for activity profiling
std::vector<std::string> kernelNames_;
std::vector<std::pair<uint64_t, uint64_t>> tsList_;
public:
//! Create a new AccumulateCommand
AccumulateCommand(HostQueue& queue, const EventWaitList& eventWaitList = nullWaitList,
const Event* waitingEvent = nullptr, uint8_t* lastPacket = nullptr)
: Command(queue, CL_COMMAND_TASK, eventWaitList, 0, waitingEvent),
lastPacket_(lastPacket)
{
profilingInfo_.multiple_ts_ = true;
}
// Return last packet
{}
//! Accessor for the last-packet pointer supplied at construction
uint8_t* getLastPacket() const {
  return lastPacket_;
}
//! Append a kernel name to the list used for activity profiling.
//! Nothing is stored unless activity_prof::IsEnabled(OP_ID_DISPATCH) reports true.
void addKernelName(const std::string& kernelName) {
  if (!activity_prof::IsEnabled(OP_ID_DISPATCH)) {
    return;
  }
  kernelNames_.push_back(kernelName);
}
//! Append one kernel's (start, end) timestamp pair to the list.
//! Nothing is stored unless activity_prof::IsEnabled(OP_ID_DISPATCH) reports true.
void addTimestamps(uint64_t startTs, uint64_t endTs) {
  if (!activity_prof::IsEnabled(OP_ID_DISPATCH)) {
    return;
  }
  tsList_.emplace_back(startTs, endTs);
}
//! Read-only view of the kernel names collected through addKernelName()
const std::vector<std::string>& getKernelNames() const { return kernelNames_; }
//! Read-only view of the (start, end) pairs collected through addTimestamps()
const std::vector<std::pair<uint64_t, uint64_t>>& getTimestamps() const { return tsList_; }
//! The command implementation
virtual void submit(device::VirtualDevice& device) {
device.submitAccumulate(*this);