SWDEV-489003 - [Ubertrace] OCL/HIP profiles are missing event instrumentation

Adds UberTrace support for pre-dispatch markers and barrier begin/end markers.

Moves shared definitions out of palgpuopen.hpp into shared header
palcapturemgr.hpp.

Change-Id: I9f464c689e7ff12c54eca043fc1ad65e1836a64f
This commit is contained in:
Daniel Livingston
2024-10-24 17:15:57 -04:00
committato da Rakesh Roy
parent c6e25b2be7
commit 541c449ce2
5 ha cambiato i file con 383 aggiunte e 233 eliminazioni
+221
Vedi File
@@ -29,6 +29,227 @@ class Device;
class VirtualGPU;
class HSAILKernel;
// ================================================================================================
// RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1)
// The values span 0x0-0xF, i.e. exactly the range of the 4-bit `identifier` bit-field that opens
// the marker layouts declared below.
enum RgpSqttMarkerIdentifier : uint32_t {
RgpSqttMarkerIdentifierEvent = 0x0,
RgpSqttMarkerIdentifierCbStart = 0x1,
RgpSqttMarkerIdentifierCbEnd = 0x2,
RgpSqttMarkerIdentifierBarrierStart = 0x3,
RgpSqttMarkerIdentifierBarrierEnd = 0x4,
RgpSqttMarkerIdentifierUserEvent = 0x5,
RgpSqttMarkerIdentifierGeneralApi = 0x6,
RgpSqttMarkerIdentifierSync = 0x7,
RgpSqttMarkerIdentifierPresent = 0x8,
RgpSqttMarkerIdentifierLayoutTransition = 0x9,
RgpSqttMarkerIdentifierRenderPass = 0xA,
RgpSqttMarkerIdentifierReserved2 = 0xB,
RgpSqttMarkerIdentifierBindPipeline = 0xC,
RgpSqttMarkerIdentifierReserved4 = 0xD,
RgpSqttMarkerIdentifierReserved5 = 0xE,
RgpSqttMarkerIdentifierReserved6 = 0xF
};
// ================================================================================================
// RgpSqttMarkerEventType - Command categories reported by an event marker.
// NOTE(review): presumably the value is what gets stored in RgpSqttMarkerEvent::apiType (24 bits);
// confirm against the writers. Values 9-25 are not defined here.
enum class RgpSqttMarkerEventType : uint32_t {
CmdNDRangeKernel = 0,
CmdScheduler = 1,
CmdCopyBuffer = 2,
CmdCopyImageToBuffer = 3,
CmdCopyBufferToImage = 4,
CmdFillBuffer = 5,
CmdCopyImage = 6,
CmdFillImage = 7,
CmdPipelineBarrier = 8,
InternalUnknown = 26,
// Sentinel only: 0xffffffff does not fit in a 24-bit field.
Invalid = 0xffffffff
};
// ================================================================================================
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
// i.e. draws/dispatches (Table 4).
//
// The marker is three dwords. Each dword is reachable both through named bit-fields and through a
// raw `dwordNN` alias in the same anonymous union, so storing to a raw dword replaces every
// bit-field that shares it.
struct RgpSqttMarkerEvent {
  union {
    struct {
      uint32_t identifier : 4;     // Identifier for this marker
      uint32_t extDwords : 3;      // Number of extra dwords following this marker
      uint32_t apiType : 24;       // The API type for this command
      uint32_t hasThreadDims : 1;  // Whether thread dimensions are included
    };
    uint32_t dword01;  // The first dword
  };
  union {
    // Some information about the vertex/instance/draw register indices. These values are not
    // always valid because they are not available for one reason or another:
    //
    // - If vertex offset index or instance offset index are not (together) valid, they are both
    //   equal to 0
    // - If draw index is not valid, it is equal to the vertex offset index
    struct {
      uint32_t cbID : 20;                // Command buffer ID for this marker
      uint32_t vertexOffsetRegIdx : 4;   // SPI userdata register index for the first vertex offset
      uint32_t instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance
                                         // offset
      uint32_t drawIndexRegIdx : 4;      // SPI userdata register index for the draw index (multi
                                         // draw indirect)
    };
    uint32_t dword02;  // The second dword
  };
  union {
    uint32_t cmdID;    // Command index within the command buffer
    uint32_t dword03;  // The third dword
  };
};
// Markers are handed to the command buffer as a dword count derived from sizeof(), so the layout
// must never silently grow or shrink.
static_assert(sizeof(RgpSqttMarkerEvent) == 3 * sizeof(uint32_t),
              "RgpSqttMarkerEvent must be exactly 3 dwords");
// ================================================================================================
// RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included
// Layout: an RgpSqttMarkerEvent followed directly by three 32-bit dimension values.
struct RgpSqttMarkerEventWithDims {
RgpSqttMarkerEvent
event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1
       // NOTE(review): "threadDim" presumably refers to event.hasThreadDims - confirm.
uint32_t threadX; // Work group count in X
uint32_t threadY; // Work group count in Y
uint32_t threadZ; // Work group count in Z
};
// ================================================================================================
// RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5)
//
// Two dwords. `dword02` is a raw alias of {driverReason, internal}: assigning `dword02` rewrites
// the `internal` bit as well, so set `internal` only after any raw store to `dword02`.
struct RgpSqttMarkerBarrierStart {
  union {
    struct {
      uint32_t identifier : 4;  // Identifier for this marker
      uint32_t extDwords : 3;   // Number of extra dwords following this marker
      uint32_t cbId : 20;       // Command buffer ID within queue
      uint32_t reserved : 5;    // Reserved
    };
    uint32_t dword01;  // The first dword
  };
  union {
    struct {
      uint32_t driverReason : 31;  // Low 31 bits of the second dword
      uint32_t internal : 1;       // Top bit of the second dword
    };
    uint32_t dword02;  // The second dword
  };
};
// Markers are handed to the command buffer as a dword count derived from sizeof(), so the layout
// must never silently grow or shrink.
static_assert(sizeof(RgpSqttMarkerBarrierStart) == 2 * sizeof(uint32_t),
              "RgpSqttMarkerBarrierStart must be exactly 2 dwords");
// ================================================================================================
// RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6)
//
// Two dwords: the first carries the marker header plus pipeline-stall flags, the second carries
// cache-operation flags and the layout-transition count. Each dword also has a raw `dwordNN`
// alias that overlays all of its bit-fields.
struct RgpSqttMarkerBarrierEnd {
  union {
    struct {
      uint32_t identifier : 4;      // Identifier for this marker
      uint32_t extDwords : 3;       // Number of extra dwords following this marker
      uint32_t cbId : 20;           // Command buffer ID within queue
      uint32_t waitOnEopTs : 1;     // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that
                                    // timestamp to be written. Quintessential full pipeline stall.
      uint32_t vsPartialFlush : 1;  // Stall at ME waiting for all prior VS waves to complete.
      uint32_t psPartialFlush : 1;  // Stall at ME waiting for all prior PS waves to complete.
      uint32_t csPartialFlush : 1;  // Stall at ME waiting for all prior CS waves to complete.
      uint32_t pfpSyncMe : 1;       // Stall PFP until ME is at same point in command stream.
    };
    uint32_t dword01;  // The first dword
  };
  union {
    struct {
      uint32_t syncCpDma : 1;  // Issue dummy CP-DMA command to confirm all prior CP-DMAs have
                               // completed.
      uint32_t invalTcp : 1;   // Invalidate the L1 vector caches.
      uint32_t invalSqI : 1;   // Invalidate the SQ instruction caches
      uint32_t invalSqK : 1;   // Invalidate the SQ constant caches (i.e. L1 scalar caches)
      uint32_t flushTcc : 1;   // Flush L2
      uint32_t invalTcc : 1;   // Invalidate L2
      uint32_t flushCb : 1;    // Flush CB caches (including DCC, cmask, fmask)
      uint32_t invalCb : 1;    // Invalidate CB caches (including DCC, cmask, fmask)
      uint32_t flushDb : 1;    // Flush DB caches (including htile)
      uint32_t invalDb : 1;    // Invalidate DB caches (including htile)
      uint32_t numLayoutTransitions : 16;  // Number of layout transitions following this packet
      uint32_t reserved : 6;               // Reserved for future expansion. Always 0
    };
    uint32_t dword02;  // The second dword
  };
};
// Markers are handed to the command buffer as a dword count derived from sizeof(), so the layout
// must never silently grow or shrink.
static_assert(sizeof(RgpSqttMarkerBarrierEnd) == 2 * sizeof(uint32_t),
              "RgpSqttMarkerBarrierEnd must be exactly 2 dwords");
// ================================================================================================
// RgpSqttMarkerPipelineBind - RGP SQ thread-tracing marker written whenever a pipeline is bound
// (Table 12).
//
// Three dwords: a header dword followed by the 64-bit API PSO hash, which can be addressed either
// as the two-element array `apiPsoHash` or as `dword02`/`dword03`.
struct RgpSqttMarkerPipelineBind {
  union {
    struct {
      uint32_t identifier : 4;  // Identifier for this marker
      uint32_t extDwords : 3;   // Number of extra dwords following this marker
      uint32_t bindPoint : 1;   // The bind point of the pipeline within a queue
                                // 0 = graphics bind point
                                // 1 = compute bind point
      uint32_t cbID : 20;       // A command buffer ID encoded as per Table 13.
      uint32_t reserved : 4;    // Reserved
    };
    uint32_t dword01;  // The first dword
  };
  union {
    uint32_t apiPsoHash[2];  // The API PSO hash of the pipeline being bound
    struct {
      uint32_t dword02;  // The second dword
      uint32_t dword03;  // The third dword
    };
  };
};
// Markers are handed to the command buffer as a dword count derived from sizeof(), so the layout
// must never silently grow or shrink.
static_assert(sizeof(RgpSqttMarkerPipelineBind) == 3 * sizeof(uint32_t),
              "RgpSqttMarkerPipelineBind must be exactly 3 dwords");
// RGP SQTT Instrumentation Specification version (API-independent)
constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1;
// RGP SQTT Instrumentation Specification version for Vulkan-specific tables
// NOTE(review): the "Vulkan-specific" wording looks inherited from another driver - confirm it
// still describes what this runtime reports.
constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
// RgpSqttMarkerUserEventType - Data types used in RGP SQ thread-tracing markers for a user
// event
// NOTE(review): presumably the value stored in RgpSqttMarkerUserEvent::dataType - confirm.
enum RgpSqttMarkerUserEventType : uint32_t {
RgpSqttMarkerUserEventTrigger = 0x0,
RgpSqttMarkerUserEventPop = 0x1,
RgpSqttMarkerUserEventPush = 0x2,
RgpSqttMarkerUserEventObjectName = 0x3,
RgpSqttMarkerUserEventReserved1 = 0x4,
RgpSqttMarkerUserEventReserved2 = 0x5,
RgpSqttMarkerUserEventReserved3 = 0x6,
RgpSqttMarkerUserEventReserved4 = 0x7,
};
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for a user event.
//
// A single dword, viewable either as named bit-fields or as the raw value `dword01`.
union RgpSqttMarkerUserEvent {
  struct {
    uint32_t identifier : 4;  // Identifier for this marker
    uint32_t extDwords : 8;   // Number of extra dwords following this marker
    uint32_t dataType : 8;    // The type for this marker
    uint32_t reserved : 12;   // reserved
  };
  uint32_t dword01;  // The first dword
};
// The header is embedded in larger markers and counted in dwords, so it must stay one dword.
static_assert(sizeof(RgpSqttMarkerUserEvent) == sizeof(uint32_t),
              "RgpSqttMarkerUserEvent must be exactly 1 dword");
// NOTE(review): presumably the size, in dwords, of the RgpSqttMarkerUserEvent header (that union
// is a single uint32_t) - confirm against its users.
constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
// The max lengths of frame marker strings
// Used as the element count of RgpSqttMarkerUserEventWithString::stringData (uint32_t elements).
static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024;
// RgpSqttMarkerUserEventWithString - RGP SQ thread-tracing marker for a user event with a string
// (push and trigger data types)
// Layout: the one-dword header, a length dword, then a fixed-capacity dword array for the text.
struct RgpSqttMarkerUserEventWithString {
RgpSqttMarkerUserEvent header;
uint32_t stringLength; // Length of the string (in characters)
// Note the unit mismatch: the length above is in characters, the capacity below is in dwords.
uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
};
// ================================================================================================
class ICaptureMgr {
public:
+2 -1
Vedi File
@@ -453,7 +453,8 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size
WriteComputeBindMarker(gpu, kernel.prog().ApiHash());
WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
// Write disaptch marker
// Write dispatch marker
WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y),
static_cast<uint32_t>(z));
}
-221
Vedi File
@@ -89,227 +89,6 @@ class HandlerServer;
} // namespace DevDriver
namespace amd::pal {
// ================================================================================================
// RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1)
enum RgpSqttMarkerIdentifier : uint32_t {
RgpSqttMarkerIdentifierEvent = 0x0,
RgpSqttMarkerIdentifierCbStart = 0x1,
RgpSqttMarkerIdentifierCbEnd = 0x2,
RgpSqttMarkerIdentifierBarrierStart = 0x3,
RgpSqttMarkerIdentifierBarrierEnd = 0x4,
RgpSqttMarkerIdentifierUserEvent = 0x5,
RgpSqttMarkerIdentifierGeneralApi = 0x6,
RgpSqttMarkerIdentifierSync = 0x7,
RgpSqttMarkerIdentifierPresent = 0x8,
RgpSqttMarkerIdentifierLayoutTransition = 0x9,
RgpSqttMarkerIdentifierRenderPass = 0xA,
RgpSqttMarkerIdentifierReserved2 = 0xB,
RgpSqttMarkerIdentifierBindPipeline = 0xC,
RgpSqttMarkerIdentifierReserved4 = 0xD,
RgpSqttMarkerIdentifierReserved5 = 0xE,
RgpSqttMarkerIdentifierReserved6 = 0xF
};
// ================================================================================================
enum class RgpSqttMarkerEventType : uint32_t {
CmdNDRangeKernel = 0,
CmdScheduler = 1,
CmdCopyBuffer = 2,
CmdCopyImageToBuffer = 3,
CmdCopyBufferToImage = 4,
CmdFillBuffer = 5,
CmdCopyImage = 6,
CmdFillImage = 7,
CmdPipelineBarrier = 8,
InternalUnknown = 26,
Invalid = 0xffffffff
};
// ================================================================================================
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
// i.e. draws/dispatches (Table 4).
struct RgpSqttMarkerEvent {
union {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t apiType : 24; // The API type for this command
uint32_t hasThreadDims : 1; // Whether thread dimensions are included
};
uint32_t dword01; // The first dword
};
union {
// Some information about the vertex/instance/draw register indices. These values are not
// always valid because they are not available for one reason or another:
//
// - If vertex offset index or instance offset index are not (together) valid, they are both
// equal to 0
// - If draw index is not valid, it is equal to the vertex offset index
struct {
uint32_t cbID : 20; // Command buffer ID for this marker
uint32_t vertexOffsetRegIdx : 4; // SPI userdata register index for the first vertex offset
uint32_t
instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset
uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw
// indirect)
};
uint32_t dword02; // The second dword
};
union {
uint32_t cmdID; // Command index within the command buffer
uint32_t dword03; // The third dword
};
};
// ================================================================================================
// RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included
struct RgpSqttMarkerEventWithDims {
RgpSqttMarkerEvent
event; // Per-draw/dispatch marker. API type should be Dispatch, threadDim = 1
uint32_t threadX; // Work group count in X
uint32_t threadY; // Work group count in Y
uint32_t threadZ; // Work group count in Z
};
// ================================================================================================
// RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5)
struct RgpSqttMarkerBarrierStart {
union {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t cbId : 20; // Command buffer ID within queue
uint32_t reserved : 5; // Reserved
};
uint32_t dword01; // The first dword
};
union {
struct {
uint32_t driverReason : 31;
uint32_t internal : 1;
};
uint32_t dword02; // The second dword
};
};
// ================================================================================================
// RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6)
struct RgpSqttMarkerBarrierEnd {
union {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t cbId : 20; // Command buffer ID within queue
uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that
// timestamp to be written. Quintessential full pipeline stall.
uint32_t vsPartialFlush : 1; // Stall at ME waiting for all prior VS waves to complete.
uint32_t psPartialFlush : 1; // Stall at ME waiting for all prior PS waves to complete.
uint32_t csPartialFlush : 1; // Stall at ME waiting for all prior CS waves to complete.
uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream.
};
uint32_t dword01; // The first dword
};
union {
struct {
uint32_t
syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
uint32_t invalTcp : 1; // Invalidate the L1 vector caches.
uint32_t invalSqI : 1; // Invalidate the SQ instruction caches
uint32_t invalSqK : 1; // Invalidate the SQ constant caches (i.e. L1 scalar caches)
uint32_t flushTcc : 1; // Flush L2
uint32_t invalTcc : 1; // Invalidate L2
uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask)
uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask)
uint32_t flushDb : 1; // Flush DB caches (including htile)
uint32_t invalDb : 1; // Invalidate DB caches (including htile)
uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
uint32_t reserved : 6; // Reserved for future expansion. Always 0
};
uint32_t dword02; // The second dword
};
};
// ================================================================================================
// RgpSqttMarkerPipelineBind - RGP SQ thread-tracing marker written whenever a pipeline is bound (Table 12).
struct RgpSqttMarkerPipelineBind {
union {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 3; // Number of extra dwords following this marker
uint32_t bindPoint : 1; // The bind point of the pipeline within a queue
// 0 = graphics bind point
// 1 = compute bind point
uint32_t cbID : 20; // A command buffer ID encoded as per Table 13.
uint32_t reserved : 4; // Reserved
};
uint32_t dword01; // The first dword
};
union {
uint32_t apiPsoHash[2]; // The API PSO hash of the pipeline being bound
struct {
uint32_t dword02; // The second dword
uint32_t dword03; // The third dword
};
};
};
// RGP SQTT Instrumentation Specification version (API-independent)
constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1;
// RGP SQTT Instrumentation Specification version for Vulkan-specific tables
constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;
// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user
// event
enum RgpSqttMarkerUserEventType : uint32_t {
RgpSqttMarkerUserEventTrigger = 0x0,
RgpSqttMarkerUserEventPop = 0x1,
RgpSqttMarkerUserEventPush = 0x2,
RgpSqttMarkerUserEventObjectName = 0x3,
RgpSqttMarkerUserEventReserved1 = 0x4,
RgpSqttMarkerUserEventReserved2 = 0x5,
RgpSqttMarkerUserEventReserved3 = 0x6,
RgpSqttMarkerUserEventReserved4 = 0x7,
};
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event.
union RgpSqttMarkerUserEvent {
struct {
uint32_t identifier : 4; // Identifier for this marker
uint32_t extDwords : 8; // Number of extra dwords following this marker
uint32_t dataType : 8; // The type for this marker
uint32_t reserved : 12; // reserved
};
uint32_t dword01; // The first dword
};
constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
// The max lengths of frame marker strings
static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024;
// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and
// trigger data types)
struct RgpSqttMarkerUserEventWithString {
RgpSqttMarkerUserEvent header;
uint32_t stringLength; // Length of the string (in characters)
uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
};
// ================================================================================================
// This class provides functionality to interact with the GPU Open Developer Mode message passing
+151 -11
Vedi File
@@ -20,6 +20,10 @@
#include "device/pal/palubercapturemgr.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palblit.hpp"
#include "palPlatform.h"
#include "palTraceSession.h"
@@ -58,6 +62,8 @@ UberTraceCaptureMgr::UberTraceCaptureMgr(Pal::IPlatform* platform, const Device&
: device_(device),
dev_driver_server_(platform->GetDevDriverServer()),
global_disp_count_(1), // Must start from 1 according to RGP spec
user_event_(nullptr),
current_event_id_(0),
trace_session_(platform->GetTraceSession()),
trace_controller_(nullptr),
code_object_trace_source_(nullptr),
@@ -74,6 +80,12 @@ bool UberTraceCaptureMgr::CreateUberTraceResources(Pal::IPlatform* platform) {
bool success = false;
do {
// Create the user event RGP marker
user_event_ = new RgpSqttMarkerUserEventWithString;
if (user_event_ == nullptr) {
break;
}
// Initialize the renderop trace controller
trace_controller_ = new GpuUtil::RenderOpTraceController(platform, device_.iDev());
if (trace_controller_ == nullptr) {
@@ -115,7 +127,11 @@ bool UberTraceCaptureMgr::CreateUberTraceResources(Pal::IPlatform* platform) {
// ================================================================================================
void UberTraceCaptureMgr::DestroyUberTraceResources() {
// Deallocate and unregister all created trace controllers & trace sources
// RGP user event marker
if (user_event_ != nullptr) {
delete user_event_;
user_event_ = nullptr;
}
// RenderOp TraceController
if (trace_controller_ != nullptr) {
@@ -169,6 +185,39 @@ void UberTraceCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel
trace_controller_->RecordRenderOp(pQueue,
GpuUtil::RenderOpTraceController::RenderOp::RenderOpDispatch);
if (trace_session_->GetTraceSessionState() == GpuUtil::TraceSessionState::Running) {
RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel;
if (kernel.prog().isInternal()) {
constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = {
RgpSqttMarkerEventType::CmdCopyImage,
RgpSqttMarkerEventType::CmdCopyImage,
RgpSqttMarkerEventType::CmdCopyImageToBuffer,
RgpSqttMarkerEventType::CmdCopyBufferToImage,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdFillBuffer,
RgpSqttMarkerEventType::CmdFillImage,
RgpSqttMarkerEventType::CmdScheduler};
for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) {
if (kernel.name().compare(BlitName[i]) == 0) {
apiEvent = ApiEvents[i];
break;
}
}
}
// Write the hash value
WriteComputeBindMarker(gpu, kernel.prog().ApiHash());
// Write dispatch marker
WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y),
static_cast<uint32_t>(z));
}
// Increment the global dispatch counter
global_disp_count_++;
}
@@ -204,16 +253,6 @@ bool UberTraceCaptureMgr::IsQueueTimingActive() const {
(queue_timings_trace_source_->IsTimingInProgress()));
}
// ================================================================================================
void UberTraceCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
const Pal::Developer::BarrierData& data) const {
}
// ================================================================================================
void UberTraceCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
const Pal::Developer::BarrierData& data) const {
}
// ================================================================================================
bool UberTraceCaptureMgr::RegisterTimedQueue(uint32_t queue_id,
Pal::IQueue* iQueue,
@@ -290,4 +329,105 @@ uint64_t UberTraceCaptureMgr::AddElfBinary(const void* exe_binary, size_t exe_bi
return elfBinaryInfo.originalHash;
}
// ================================================================================================
void UberTraceCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data,
size_t data_size) const {
assert((data_size % sizeof(uint32_t)) == 0);
assert((data_size / sizeof(uint32_t)) > 0);
Pal::RgpMarkerSubQueueFlags subQueueFlags = {};
subQueueFlags.includeMainSubQueue = 1;
gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker(
subQueueFlags, static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
}
// ================================================================================================
// Inserts a compute bind marker
void UberTraceCaptureMgr::WriteComputeBindMarker(const VirtualGPU* gpu, uint64_t api_hash) const {
RgpSqttMarkerPipelineBind marker = {};
marker.identifier = RgpSqttMarkerIdentifierBindPipeline;
marker.cbID = gpu->queue(MainEngine).cmdBufId();
marker.bindPoint = 1;
memcpy(marker.apiPsoHash, &api_hash, sizeof(api_hash));
WriteMarker(gpu, &marker, sizeof(marker));
}
// ================================================================================================
// Inserts an RGP pre-dispatch marker
void UberTraceCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu,
RgpSqttMarkerEventType apiType,
uint32_t x, uint32_t y, uint32_t z) const {
assert(apiType != RgpSqttMarkerEventType::Invalid);
RgpSqttMarkerEvent event = {};
event.identifier = RgpSqttMarkerIdentifierEvent;
event.apiType = static_cast<uint32_t>(apiType);
event.cmdID = current_event_id_++;
event.cbID = gpu->queue(MainEngine).cmdBufId();
RgpSqttMarkerEventWithDims eventWithDims = {};
eventWithDims.event = event;
eventWithDims.event.hasThreadDims = 1;
eventWithDims.threadX = x;
eventWithDims.threadY = y;
eventWithDims.threadZ = z;
WriteMarker(gpu, &eventWithDims, sizeof(eventWithDims));
}
// ================================================================================================
// Writes a "Barrier Start" marker, but only while the trace session is in the Running state.
// The marker is built and written while holding trace_mutex_.
//
// gpu  - virtual GPU whose main-engine command buffer supplies the command buffer ID and
//        receives the marker
// data - barrier data; only data.reason is read
void UberTraceCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
                                                  const Pal::Developer::BarrierData& data) const {
  if (trace_session_->GetTraceSessionState() == GpuUtil::TraceSessionState::Running) {
    amd::ScopedLock traceLock(&trace_mutex_);
    RgpSqttMarkerBarrierStart marker = {};
    marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
    marker.cbId = gpu->queue(MainEngine).cmdBufId();
    // dword02 is a raw alias of {driverReason : 31, internal : 1}. Store the raw reason first and
    // set the flag afterwards; in the opposite order the raw store wipes out the `internal` bit.
    marker.dword02 = data.reason;
    marker.internal = true;
    WriteMarker(gpu, &marker, sizeof(marker));
  }
}
// ================================================================================================
// Writes a "Barrier End" marker, but only while the trace session is in the Running state.
// The marker is built and written while holding trace_mutex_.
//
// gpu  - virtual GPU whose main-engine command buffer supplies the command buffer ID and
//        receives the marker
// data - barrier data; only data.operations is read
//
// The marker reflects the operations of this barrier callback only. No state from earlier markers
// is accumulated here.
void UberTraceCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
                                                const Pal::Developer::BarrierData& data) const {
  if (trace_session_->GetTraceSessionState() == GpuUtil::TraceSessionState::Running) {
    amd::ScopedLock traceLock(&trace_mutex_);

    const auto& stalls = data.operations.pipelineStalls;
    const auto& caches = data.operations.caches;

    RgpSqttMarkerBarrierEnd marker = {};
    marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
    marker.cbId = gpu->queue(MainEngine).cmdBufId();
    marker.numLayoutTransitions = 0;

    // Pipeline stalls (first dword, plus syncCpDma which lives in the second dword)
    marker.waitOnEopTs = stalls.eopTsBottomOfPipe;
    marker.vsPartialFlush = stalls.vsPartialFlush;
    marker.psPartialFlush = stalls.psPartialFlush;
    marker.csPartialFlush = stalls.csPartialFlush;
    marker.pfpSyncMe = stalls.pfpSyncMe;
    marker.syncCpDma = stalls.syncCpDma;

    // Cache operations (second dword)
    marker.invalTcp = caches.invalTcp;
    marker.invalSqI = caches.invalSqI$;
    marker.invalSqK = caches.invalSqK$;
    marker.flushTcc = caches.flushTcc;
    marker.invalTcc = caches.invalTcc;
    marker.flushCb = caches.flushCb;
    marker.invalCb = caches.invalCb;
    marker.flushDb = caches.flushDb;
    marker.invalDb = caches.invalDb;

    WriteMarker(gpu, &marker, sizeof(marker));
  }
}
} // namespace amd::pal
@@ -21,6 +21,7 @@
#pragma once
#include "device/pal/palcapturemgr.hpp"
#include "thread/monitor.hpp"
namespace DevDriver
{
@@ -80,10 +81,18 @@ class UberTraceCaptureMgr final : public ICaptureMgr {
bool CreateUberTraceResources(Pal::IPlatform* platform);
void DestroyUberTraceResources();
void WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const;
void WriteComputeBindMarker(const VirtualGPU* gpu, uint64_t api_hash) const;
void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, uint32_t x,
uint32_t y, uint32_t z) const;
const Device& device_;
DevDriver::DevDriverServer* dev_driver_server_;
uint64_t global_disp_count_;
RgpSqttMarkerUserEventWithString* user_event_;
mutable uint32_t current_event_id_;
mutable amd::Monitor trace_mutex_;
GpuUtil::TraceSession* trace_session_;
GpuUtil::RenderOpTraceController* trace_controller_;
GpuUtil::CodeObjectTraceSource* code_object_trace_source_;