diff --git a/rocclr/device/pal/palcapturemgr.hpp b/rocclr/device/pal/palcapturemgr.hpp index 20307df8b9..2bfa2e0df2 100644 --- a/rocclr/device/pal/palcapturemgr.hpp +++ b/rocclr/device/pal/palcapturemgr.hpp @@ -29,6 +29,227 @@ class Device; class VirtualGPU; class HSAILKernel; +// ================================================================================================ +// RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1) +enum RgpSqttMarkerIdentifier : uint32_t { + RgpSqttMarkerIdentifierEvent = 0x0, + RgpSqttMarkerIdentifierCbStart = 0x1, + RgpSqttMarkerIdentifierCbEnd = 0x2, + RgpSqttMarkerIdentifierBarrierStart = 0x3, + RgpSqttMarkerIdentifierBarrierEnd = 0x4, + RgpSqttMarkerIdentifierUserEvent = 0x5, + RgpSqttMarkerIdentifierGeneralApi = 0x6, + RgpSqttMarkerIdentifierSync = 0x7, + RgpSqttMarkerIdentifierPresent = 0x8, + RgpSqttMarkerIdentifierLayoutTransition = 0x9, + RgpSqttMarkerIdentifierRenderPass = 0xA, + RgpSqttMarkerIdentifierReserved2 = 0xB, + RgpSqttMarkerIdentifierBindPipeline = 0xC, + RgpSqttMarkerIdentifierReserved4 = 0xD, + RgpSqttMarkerIdentifierReserved5 = 0xE, + RgpSqttMarkerIdentifierReserved6 = 0xF +}; + +// ================================================================================================ +enum class RgpSqttMarkerEventType : uint32_t { + CmdNDRangeKernel = 0, + CmdScheduler = 1, + CmdCopyBuffer = 2, + CmdCopyImageToBuffer = 3, + CmdCopyBufferToImage = 4, + CmdFillBuffer = 5, + CmdCopyImage = 6, + CmdFillImage = 7, + CmdPipelineBarrier = 8, + InternalUnknown = 26, + Invalid = 0xffffffff +}; + +// ================================================================================================ +// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. +// These are generated ahead of draws or dispatches for commands that trigger generation of waves +// i.e. draws/dispatches (Table 4). 
+struct RgpSqttMarkerEvent { + union { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 3; // Number of extra dwords following this marker + uint32_t apiType : 24; // The API type for this command + uint32_t hasThreadDims : 1; // Whether thread dimensions are included + }; + + uint32_t dword01; // The first dword + }; + + union { + // Some information about the vertex/instance/draw register indices. These values are not + // always valid because they are not available for one reason or another: + // + // - If vertex offset index or instance offset index are not (together) valid, they are both + // equal to 0 + // - If draw index is not valid, it is equal to the vertex offset index + struct { + uint32_t cbID : 20; // Command buffer ID for this marker + uint32_t vertexOffsetRegIdx : 4; // SPI userdata register index for the first vertex offset + uint32_t + instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset + uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw + // indirect) + }; + uint32_t dword02; // The second dword + }; + + union { + uint32_t cmdID; // Command index within the command buffer + uint32_t dword03; // The third dword + }; +}; + +// ================================================================================================ +// RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included +struct RgpSqttMarkerEventWithDims { + RgpSqttMarkerEvent + event; // Per-draw/dispatch marker. 
API type should be Dispatch, threadDim = 1 + uint32_t threadX; // Work group count in X + uint32_t threadY; // Work group count in Y + uint32_t threadZ; // Work group count in Z +}; + +// ================================================================================================ +// RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5) +struct RgpSqttMarkerBarrierStart { + union { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 3; // Number of extra dwords following this marker + uint32_t cbId : 20; // Command buffer ID within queue + uint32_t reserved : 5; // Reserved + }; + + uint32_t dword01; // The first dword + }; + + union { + struct { + uint32_t driverReason : 31; + uint32_t internal : 1; + }; + + uint32_t dword02; // The second dword + }; +}; + +// ================================================================================================ +// RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6) +struct RgpSqttMarkerBarrierEnd { + union { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 3; // Number of extra dwords following this marker + uint32_t cbId : 20; // Command buffer ID within queue + uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that + // timestamp to be written. Quintessential full pipeline stall. + uint32_t vsPartialFlush : 1; // Stall at ME waiting for all prior VS waves to complete. + uint32_t psPartialFlush : 1; // Stall at ME waiting for all prior PS waves to complete. + uint32_t csPartialFlush : 1; // Stall at ME waiting for all prior CS waves to complete. + uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream. + }; + + uint32_t dword01; // The first dword + }; + + union { + struct { + uint32_t + syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed. 
+ uint32_t invalTcp : 1; // Invalidate the L1 vector caches. + uint32_t invalSqI : 1; // Invalidate the SQ instruction caches + uint32_t invalSqK : 1; // Invalidate the SQ constant caches (i.e. L1 scalar caches) + uint32_t flushTcc : 1; // Flush L2 + uint32_t invalTcc : 1; // Invalidate L2 + uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask) + uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask) + uint32_t flushDb : 1; // Flush DB caches (including htile) + uint32_t invalDb : 1; // Invalidate DB caches (including htile) + uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet + uint32_t reserved : 6; // Reserved for future expansion. Always 0 + }; + + uint32_t dword02; // The second dword + }; +}; + +// ================================================================================================ +// RgpSqttMarkerPipelineBind - RGP SQ thread-tracing marker written whenever a pipeline is bound (Table 12). +struct RgpSqttMarkerPipelineBind { + union { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 3; // Number of extra dwords following this marker + uint32_t bindPoint : 1; // The bind point of the pipeline within a queue + // 0 = graphics bind point + // 1 = compute bind point + uint32_t cbID : 20; // A command buffer ID encoded as per Table 13. 
+ uint32_t reserved : 4; // Reserved + }; + + uint32_t dword01; // The first dword + }; + + union { + uint32_t apiPsoHash[2]; // The API PSO hash of the pipeline being bound + struct { + uint32_t dword02; // The second dword + uint32_t dword03; // The third dword + }; + }; +}; + +// RGP SQTT Instrumentation Specification version (API-independent) +constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1; + +// RGP SQTT Instrumentation Specification version for Vulkan-specific tables +constexpr uint32_t RgpSqttInstrumentationApiVersion = 0; + +// RgpSqttMarkerUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user +// event +enum RgpSqttMarkerUserEventType : uint32_t { + RgpSqttMarkerUserEventTrigger = 0x0, + RgpSqttMarkerUserEventPop = 0x1, + RgpSqttMarkerUserEventPush = 0x2, + RgpSqttMarkerUserEventObjectName = 0x3, + RgpSqttMarkerUserEventReserved1 = 0x4, + RgpSqttMarkerUserEventReserved2 = 0x5, + RgpSqttMarkerUserEventReserved3 = 0x6, + RgpSqttMarkerUserEventReserved4 = 0x7, +}; + +// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event. 
+union RgpSqttMarkerUserEvent { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 8; // Number of extra dwords following this marker + uint32_t dataType : 8; // The type for this marker + uint32_t reserved : 12; // reserved + }; + + uint32_t dword01; // The first dword +}; + +constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1; + +// The max lengths of frame marker strings +static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024; + +// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and +// trigger data types) +struct RgpSqttMarkerUserEventWithString { + RgpSqttMarkerUserEvent header; + + uint32_t stringLength; // Length of the string (in characters) + uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format +}; + // ================================================================================================ class ICaptureMgr { public: diff --git a/rocclr/device/pal/palgpuopen.cpp b/rocclr/device/pal/palgpuopen.cpp index a72b2d818c..9ed597656e 100644 --- a/rocclr/device/pal/palgpuopen.cpp +++ b/rocclr/device/pal/palgpuopen.cpp @@ -453,7 +453,8 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size WriteComputeBindMarker(gpu, kernel.prog().ApiHash()); WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name()); - // Write disaptch marker + + // Write dispatch marker WriteEventWithDimsMarker(gpu, apiEvent, static_cast(x), static_cast(y), static_cast(z)); } diff --git a/rocclr/device/pal/palgpuopen.hpp b/rocclr/device/pal/palgpuopen.hpp index 1bb1985732..ab5136dfbf 100644 --- a/rocclr/device/pal/palgpuopen.hpp +++ b/rocclr/device/pal/palgpuopen.hpp @@ -89,227 +89,6 @@ class HandlerServer; } // namespace DevDriver namespace amd::pal { -// ================================================================================================ -// RgpSqttMarkerIdentifier - Identifiers for RGP SQ 
thread-tracing markers (Table 1) -enum RgpSqttMarkerIdentifier : uint32_t { - RgpSqttMarkerIdentifierEvent = 0x0, - RgpSqttMarkerIdentifierCbStart = 0x1, - RgpSqttMarkerIdentifierCbEnd = 0x2, - RgpSqttMarkerIdentifierBarrierStart = 0x3, - RgpSqttMarkerIdentifierBarrierEnd = 0x4, - RgpSqttMarkerIdentifierUserEvent = 0x5, - RgpSqttMarkerIdentifierGeneralApi = 0x6, - RgpSqttMarkerIdentifierSync = 0x7, - RgpSqttMarkerIdentifierPresent = 0x8, - RgpSqttMarkerIdentifierLayoutTransition = 0x9, - RgpSqttMarkerIdentifierRenderPass = 0xA, - RgpSqttMarkerIdentifierReserved2 = 0xB, - RgpSqttMarkerIdentifierBindPipeline = 0xC, - RgpSqttMarkerIdentifierReserved4 = 0xD, - RgpSqttMarkerIdentifierReserved5 = 0xE, - RgpSqttMarkerIdentifierReserved6 = 0xF -}; - -// ================================================================================================ -enum class RgpSqttMarkerEventType : uint32_t { - CmdNDRangeKernel = 0, - CmdScheduler = 1, - CmdCopyBuffer = 2, - CmdCopyImageToBuffer = 3, - CmdCopyBufferToImage = 4, - CmdFillBuffer = 5, - CmdCopyImage = 6, - CmdFillImage = 7, - CmdPipelineBarrier = 8, - InternalUnknown = 26, - Invalid = 0xffffffff -}; - -// ================================================================================================ -// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. -// These are generated ahead of draws or dispatches for commands that trigger generation of waves -// i.e. draws/dispatches (Table 4). -struct RgpSqttMarkerEvent { - union { - struct { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 3; // Number of extra dwords following this marker - uint32_t apiType : 24; // The API type for this command - uint32_t hasThreadDims : 1; // Whether thread dimensions are included - }; - - uint32_t dword01; // The first dword - }; - - union { - // Some information about the vertex/instance/draw register indices. 
These values are not - // always valid because they are not available for one reason or another: - // - // - If vertex offset index or instance offset index are not (together) valid, they are both - // equal to 0 - // - If draw index is not valid, it is equal to the vertex offset index - struct { - uint32_t cbID : 20; // Command buffer ID for this marker - uint32_t vertexOffsetRegIdx : 4; // SPI userdata register index for the first vertex offset - uint32_t - instanceOffsetRegIdx : 4; // SPI userdata register index for the first instance offset - uint32_t drawIndexRegIdx : 4; // SPI userdata register index for the draw index (multi draw - // indirect) - }; - uint32_t dword02; // The second dword - }; - - union { - uint32_t cmdID; // Command index within the command buffer - uint32_t dword03; // The third dword - }; -}; - -// ================================================================================================ -// RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included -struct RgpSqttMarkerEventWithDims { - RgpSqttMarkerEvent - event; // Per-draw/dispatch marker. 
API type should be Dispatch, threadDim = 1 - uint32_t threadX; // Work group count in X - uint32_t threadY; // Work group count in Y - uint32_t threadZ; // Work group count in Z -}; - -// ================================================================================================ -// RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5) -struct RgpSqttMarkerBarrierStart { - union { - struct { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 3; // Number of extra dwords following this marker - uint32_t cbId : 20; // Command buffer ID within queue - uint32_t reserved : 5; // Reserved - }; - - uint32_t dword01; // The first dword - }; - - union { - struct { - uint32_t driverReason : 31; - uint32_t internal : 1; - }; - - uint32_t dword02; // The second dword - }; -}; - -// ================================================================================================ -// RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6) -struct RgpSqttMarkerBarrierEnd { - union { - struct { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 3; // Number of extra dwords following this marker - uint32_t cbId : 20; // Command buffer ID within queue - uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that - // timestamp to be written. Quintessential full pipeline stall. - uint32_t vsPartialFlush : 1; // Stall at ME waiting for all prior VS waves to complete. - uint32_t psPartialFlush : 1; // Stall at ME waiting for all prior PS waves to complete. - uint32_t csPartialFlush : 1; // Stall at ME waiting for all prior CS waves to complete. - uint32_t pfpSyncMe : 1; // Stall PFP until ME is at same point in command stream. - }; - - uint32_t dword01; // The first dword - }; - - union { - struct { - uint32_t - syncCpDma : 1; // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed. 
- uint32_t invalTcp : 1; // Invalidate the L1 vector caches. - uint32_t invalSqI : 1; // Invalidate the SQ instruction caches - uint32_t invalSqK : 1; // Invalidate the SQ constant caches (i.e. L1 scalar caches) - uint32_t flushTcc : 1; // Flush L2 - uint32_t invalTcc : 1; // Invalidate L2 - uint32_t flushCb : 1; // Flush CB caches (including DCC, cmask, fmask) - uint32_t invalCb : 1; // Invalidate CB caches (including DCC, cmask, fmask) - uint32_t flushDb : 1; // Flush DB caches (including htile) - uint32_t invalDb : 1; // Invalidate DB caches (including htile) - uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet - uint32_t reserved : 6; // Reserved for future expansion. Always 0 - }; - - uint32_t dword02; // The second dword - }; -}; - -// ================================================================================================ -// RgpSqttMarkerPipelineBind - RGP SQ thread-tracing marker written whenever a pipeline is bound (Table 12). -struct RgpSqttMarkerPipelineBind { - union { - struct { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 3; // Number of extra dwords following this marker - uint32_t bindPoint : 1; // The bind point of the pipeline within a queue - // 0 = graphics bind point - // 1 = compute bind point - uint32_t cbID : 20; // A command buffer ID encoded as per Table 13. 
- uint32_t reserved : 4; // Reserved - }; - - uint32_t dword01; // The first dword - }; - - union { - uint32_t apiPsoHash[2]; // The API PSO hash of the pipeline being bound - struct { - uint32_t dword02; // The second dword - uint32_t dword03; // The third dword - }; - }; -}; - - -// RGP SQTT Instrumentation Specification version (API-independent) -constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1; - -// RGP SQTT Instrumentation Specification version for Vulkan-specific tables -constexpr uint32_t RgpSqttInstrumentationApiVersion = 0; - -// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user -// event -enum RgpSqttMarkerUserEventType : uint32_t { - RgpSqttMarkerUserEventTrigger = 0x0, - RgpSqttMarkerUserEventPop = 0x1, - RgpSqttMarkerUserEventPush = 0x2, - RgpSqttMarkerUserEventObjectName = 0x3, - RgpSqttMarkerUserEventReserved1 = 0x4, - RgpSqttMarkerUserEventReserved2 = 0x5, - RgpSqttMarkerUserEventReserved3 = 0x6, - RgpSqttMarkerUserEventReserved4 = 0x7, -}; - -// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event. 
-union RgpSqttMarkerUserEvent { - struct { - uint32_t identifier : 4; // Identifier for this marker - uint32_t extDwords : 8; // Number of extra dwords following this marker - uint32_t dataType : 8; // The type for this marker - uint32_t reserved : 12; // reserved - }; - - uint32_t dword01; // The first dword -}; - -constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1; - -// The max lengths of frame marker strings -static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024; - -// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and -// trigger data types) -struct RgpSqttMarkerUserEventWithString { - RgpSqttMarkerUserEvent header; - - uint32_t stringLength; // Length of the string (in characters) - uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format -}; // ================================================================================================ // This class provides functionality to interact with the GPU Open Developer Mode message passing diff --git a/rocclr/device/pal/palubercapturemgr.cpp b/rocclr/device/pal/palubercapturemgr.cpp index 52b615e560..bbf882bde2 100644 --- a/rocclr/device/pal/palubercapturemgr.cpp +++ b/rocclr/device/pal/palubercapturemgr.cpp @@ -20,6 +20,10 @@ #include "device/pal/palubercapturemgr.hpp" #include "device/pal/paldevice.hpp" +#include "device/pal/palvirtual.hpp" +#include "device/pal/palprogram.hpp" +#include "device/pal/palkernel.hpp" +#include "device/pal/palblit.hpp" #include "palPlatform.h" #include "palTraceSession.h" @@ -58,6 +62,8 @@ UberTraceCaptureMgr::UberTraceCaptureMgr(Pal::IPlatform* platform, const Device& : device_(device), dev_driver_server_(platform->GetDevDriverServer()), global_disp_count_(1), // Must start from 1 according to RGP spec + user_event_(nullptr), + current_event_id_(0), trace_session_(platform->GetTraceSession()), trace_controller_(nullptr), code_object_trace_source_(nullptr), @@ -74,6 +80,12 @@ bool 
UberTraceCaptureMgr::CreateUberTraceResources(Pal::IPlatform* platform) { bool success = false; do { + // Create the user event RGP marker + user_event_ = new RgpSqttMarkerUserEventWithString; + if (user_event_ == nullptr) { + break; + } + // Initialize the renderop trace controller trace_controller_ = new GpuUtil::RenderOpTraceController(platform, device_.iDev()); if (trace_controller_ == nullptr) { @@ -115,7 +127,11 @@ bool UberTraceCaptureMgr::CreateUberTraceResources(Pal::IPlatform* platform) { // ================================================================================================ void UberTraceCaptureMgr::DestroyUberTraceResources() { - // Deallocate and unregister all created trace controllers & trace sources + // RGP user event marker + if (user_event_ != nullptr) { + delete user_event_; + user_event_ = nullptr; + } // RenderOp TraceController if (trace_controller_ != nullptr) { @@ -169,6 +185,39 @@ void UberTraceCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel trace_controller_->RecordRenderOp(pQueue, GpuUtil::RenderOpTraceController::RenderOp::RenderOpDispatch); + if (trace_session_->GetTraceSessionState() == GpuUtil::TraceSessionState::Running) { + RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel; + + if (kernel.prog().isInternal()) { + constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = { + RgpSqttMarkerEventType::CmdCopyImage, + RgpSqttMarkerEventType::CmdCopyImage, + RgpSqttMarkerEventType::CmdCopyImageToBuffer, + RgpSqttMarkerEventType::CmdCopyBufferToImage, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdCopyBuffer, + RgpSqttMarkerEventType::CmdFillBuffer, + RgpSqttMarkerEventType::CmdFillImage, + RgpSqttMarkerEventType::CmdScheduler}; + + for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) { + if (kernel.name().compare(BlitName[i]) == 0) { + apiEvent = ApiEvents[i]; 
+ break; + } + } + } + + // Write the hash value + WriteComputeBindMarker(gpu, kernel.prog().ApiHash()); + + // Write dispatch marker + WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y), + static_cast<uint32_t>(z)); + } + // Increment the global dispatch counter global_disp_count_++; } @@ -204,16 +253,6 @@ bool UberTraceCaptureMgr::IsQueueTimingActive() const { (queue_timings_trace_source_->IsTimingInProgress())); } -// ================================================================================================ -void UberTraceCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu, - const Pal::Developer::BarrierData& data) const { -} - -// ================================================================================================ -void UberTraceCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu, - const Pal::Developer::BarrierData& data) const { -} - // ================================================================================================ bool UberTraceCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, @@ -290,4 +329,105 @@ uint64_t UberTraceCaptureMgr::AddElfBinary(const void* exe_binary, size_t exe_bi return elfBinaryInfo.originalHash; } +// ================================================================================================ +void UberTraceCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, + size_t data_size) const { + assert((data_size % sizeof(uint32_t)) == 0); + assert((data_size / sizeof(uint32_t)) > 0); + + Pal::RgpMarkerSubQueueFlags subQueueFlags = {}; + subQueueFlags.includeMainSubQueue = 1; + + gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker( + subQueueFlags, static_cast<uint32_t>(data_size / sizeof(uint32_t)), data); +} + +// ================================================================================================ +// Inserts a compute bind marker +void UberTraceCaptureMgr::WriteComputeBindMarker(const VirtualGPU* gpu, uint64_t api_hash) const { +
RgpSqttMarkerPipelineBind marker = {};
+  marker.identifier = RgpSqttMarkerIdentifierBindPipeline;
+  marker.cbID = gpu->queue(MainEngine).cmdBufId();
+  marker.bindPoint = 1;  // 1 = compute bind point
+
+  memcpy(marker.apiPsoHash, &api_hash, sizeof(api_hash));  // 64-bit hash fills both hash dwords
+  WriteMarker(gpu, &marker, sizeof(marker));
+}
+
+// ================================================================================================
+// Inserts an RGP pre-dispatch event marker that carries the x/y/z dispatch dimensions
+void UberTraceCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu,
+                                                   RgpSqttMarkerEventType apiType,
+                                                   uint32_t x, uint32_t y, uint32_t z) const {
+  assert(apiType != RgpSqttMarkerEventType::Invalid);
+
+  RgpSqttMarkerEvent event = {};
+  event.identifier = RgpSqttMarkerIdentifierEvent;
+  event.apiType = static_cast<uint32_t>(apiType);
+  event.cmdID = current_event_id_++;  // Mutable counter: advances on every marker written
+  event.cbID = gpu->queue(MainEngine).cmdBufId();
+
+  RgpSqttMarkerEventWithDims eventWithDims = {};
+  eventWithDims.event = event;
+  eventWithDims.event.hasThreadDims = 1;
+  eventWithDims.threadX = x;
+  eventWithDims.threadY = y;
+  eventWithDims.threadZ = z;
+
+  WriteMarker(gpu, &eventWithDims, sizeof(eventWithDims));
+}
+
+// ================================================================================================
+void UberTraceCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
+                                                  const Pal::Developer::BarrierData& data) const {
+  if (trace_session_->GetTraceSessionState() == GpuUtil::TraceSessionState::Running) {
+    amd::ScopedLock traceLock(&trace_mutex_);
+
+    RgpSqttMarkerBarrierStart marker = {};
+    marker.cbId = gpu->queue(MainEngine).cmdBufId();
+    marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
+    marker.dword02 = data.reason;  // Whole-dword store: overwrites driverReason and internal
+    marker.internal = true;        // Shares storage with dword02, so it must be set afterwards
+
+    WriteMarker(gpu, &marker, sizeof(marker));
+  }
+}
+
+// ================================================================================================
+void UberTraceCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
+                                                const Pal::Developer::BarrierData& data) const {
+  if
(trace_session_->GetTraceSessionState() == GpuUtil::TraceSessionState::Running) { + amd::ScopedLock traceLock(&trace_mutex_); + + // Copy the operations part and include the same data from previous markers + // within the same barrier sequence to create a full picture of all cache + // syncs and pipeline stalls. + Pal::Developer::BarrierOperations operations = data.operations; + operations.pipelineStalls.u16All |= 0; + operations.caches.u16All |= 0; + + RgpSqttMarkerBarrierEnd marker = {}; + marker.identifier = RgpSqttMarkerIdentifierBarrierEnd; + marker.cbId = gpu->queue(MainEngine).cmdBufId(); + marker.numLayoutTransitions = 0; + marker.waitOnEopTs = operations.pipelineStalls.eopTsBottomOfPipe; + marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush; + marker.psPartialFlush = operations.pipelineStalls.psPartialFlush; + marker.csPartialFlush = operations.pipelineStalls.csPartialFlush; + marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe; + marker.syncCpDma = operations.pipelineStalls.syncCpDma; + marker.invalTcp = operations.caches.invalTcp; + marker.invalSqI = operations.caches.invalSqI$; + marker.invalSqK = operations.caches.invalSqK$; + marker.flushTcc = operations.caches.flushTcc; + marker.invalTcc = operations.caches.invalTcc; + marker.flushCb = operations.caches.flushCb; + marker.invalCb = operations.caches.invalCb; + marker.flushDb = operations.caches.flushDb; + marker.invalDb = operations.caches.invalDb; + + WriteMarker(gpu, &marker, sizeof(marker)); + } +} + } // namespace amd::pal diff --git a/rocclr/device/pal/palubercapturemgr.hpp b/rocclr/device/pal/palubercapturemgr.hpp index 35950ce8f0..f29902f646 100644 --- a/rocclr/device/pal/palubercapturemgr.hpp +++ b/rocclr/device/pal/palubercapturemgr.hpp @@ -21,6 +21,7 @@ #pragma once #include "device/pal/palcapturemgr.hpp" +#include "thread/monitor.hpp" namespace DevDriver { @@ -80,10 +81,18 @@ class UberTraceCaptureMgr final : public ICaptureMgr { bool CreateUberTraceResources(Pal::IPlatform* 
platform); void DestroyUberTraceResources(); + void WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const; + void WriteComputeBindMarker(const VirtualGPU* gpu, uint64_t api_hash) const; + void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, uint32_t x, + uint32_t y, uint32_t z) const; + const Device& device_; DevDriver::DevDriverServer* dev_driver_server_; uint64_t global_disp_count_; + RgpSqttMarkerUserEventWithString* user_event_; + mutable uint32_t current_event_id_; + mutable amd::Monitor trace_mutex_; GpuUtil::TraceSession* trace_session_; GpuUtil::RenderOpTraceController* trace_controller_; GpuUtil::CodeObjectTraceSource* code_object_trace_source_;