Files
rocm-systems/rocclr/runtime/device/pal/palgpuopen.cpp
T
foreman 0212151a49 P4 to Git Change 1704536 by gandryey@gera-w8 on 2018/11/08 15:29:54
SWDEV-79445 - OCL generic changes and code clean-up
	- Fix linux build

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#12 edit
2018-11-08 15:39:14 -05:00

863 líneas
30 KiB
C++

/*
**************************************************************************************************
*
* Trade secret of Advanced Micro Devices, Inc.
* Copyright (c) 2016, Advanced Micro Devices, Inc., (unpublished)
*
* All rights reserved. This notice is intended as a precaution against inadvertent publication
* and does not imply publication or any waiver of confidentiality. The year included in
* the foregoing notice is the year of creation of the work.
*
**************************************************************************************************
*/
#include "device/pal/palgpuopen.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palblit.hpp"
// PAL headers
#include "palCmdAllocator.h"
#include "palFence.h"
#include "palQueueSemaphore.h"
// gpuutil headers
#include "gpuUtil/palGpaSession.h"
// gpuopen headers
#include "devDriverServer.h"
#include "msgChannel.h"
#include "msgTransport.h"
#include "protocols/rgpServer.h"
#include "protocols/driverControlServer.h"
namespace pal
{
// ================================================================================================
// Constructs the capture manager for one device. Only plain member initialization happens
// here; the RGP server lookup and the GPA session creation are performed later by Init().
//
// platform - PAL platform that supplies the developer-driver server
// device   - device whose settings provide the initial SQTT dispatch limit
RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
    : device_(device),
      dev_driver_server_(platform->GetDevDriverServer()),
      user_event_(nullptr),
      num_prep_disp_(0),
      // Read the setting through the constructor argument instead of through device_, so the
      // value does not depend on device_ being declared (and therefore initialized) earlier.
      max_sqtt_disp_(device.settings().rgpSqttDispCount_),
      trace_gpu_mem_limit_(0),
      global_disp_count_(1),  // Must start from 1 according to RGP spec
      trace_enabled_(false),
      inst_tracing_enabled_(false) {
  // rgp_server_ is only assigned in Init(); give it a defined value until then.
  rgp_server_ = nullptr;
  memset(&trace_, 0, sizeof(trace_));
}
// ================================================================================================
// Tears the manager down by delegating all resource release to DestroyRGPTracing().
RgpCaptureMgr::~RgpCaptureMgr() { this->DestroyRGPTracing(); }
// ================================================================================================
// Factory for the GPU Open developer-mode capture manager.
//
// Allocates a manager and runs Init() on it. Returns the initialized manager, or nullptr when
// the allocation or the initialization fails; a manager that fails Init() is deleted here.
RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) {
  RgpCaptureMgr* capture_mgr = new RgpCaptureMgr(platform, device);
  if (capture_mgr == nullptr) {
    return nullptr;
  }
  if (capture_mgr->Init(platform)) {
    return capture_mgr;
  }
  delete capture_mgr;
  return nullptr;
}
// ================================================================================================
// Connects the manager to the RGP server and creates the per-device tracing resources
// (GPA session and the reusable user-event marker buffer).
//
// platform - PAL platform handed to the GPA session
//
// Returns true when tracing resources are ready. Returns false when there is no developer-driver
// server or RGP server, when the device cannot trace, or when a resource could not be created;
// in the latter cases traces are disabled on the server and partial resources are released.
bool RgpCaptureMgr::Init(Pal::IPlatform* platform) {
  if (dev_driver_server_ == nullptr) {
    return false;
  }

  const Settings& cfg = device_.settings();

  // Tell RGP that the server (i.e. the driver) supports tracing if requested.
  rgp_server_ = dev_driver_server_->GetRGPServer();
  if (rgp_server_ == nullptr) {
    return false;
  }

  // Finalize RGP settings
  Finalize();

  // Tracing resources are not created if SQTT tracing has been force-disabled in the settings
  // or if the device's gfxip does not support RGP traces.
  bool ok = GpuSupportsTracing(device_.properties(), cfg);

  if (ok) {
    // Create and initialize a GPA session object for this trace session
    assert(trace_.gpa_session_ == nullptr);
    const uint32_t ocl_version = cfg.oclVersion_;
    trace_.gpa_session_ = new GpuUtil::GpaSession(platform, device_.iDev(),
                                                  ocl_version >> 4,   // OCL API version major
                                                  ocl_version & 0xf,  // OCL API version minor
                                                  RgpSqttInstrumentationSpecVersion,
                                                  RgpSqttInstrumentationApiVersion);
    ok = (trace_.gpa_session_ != nullptr) &&
        (trace_.gpa_session_->Init() == Pal::Result::Success);
  }

  if (ok) {
    // Scratch buffer reused for every user-event string marker
    user_event_ = new RgpSqttMarkerUserEventWithString;
    ok = (user_event_ != nullptr);
  }

  if (ok) {
    PostDeviceCreate();
    return true;
  }

  // If we've failed to initialize tracing, permanently disable traces
  if (rgp_server_ != nullptr) {
    rgp_server_->DisableTraces();
    trace_enabled_ = false;
  }
  // Clean up whatever was created before the failure
  DestroyRGPTracing();
  return false;
}
// ================================================================================================
// This function finds out all the queues in the device that we have to synchronize for RGP-traced
// frames and initializes resources for them.
bool RgpCaptureMgr::RegisterTimedQueue(
uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const
{
bool result = true;
// Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
// it may be optional for Vulkan, but we provide it anyway if available).
Pal::KernelContextInfo kernelContextInfo = {};
Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo);
// Ensure we've acquired the debug VMID (note that some platforms do not
// implement this function, so don't fail the whole trace if so)
*debug_vmid = kernelContextInfo.flags.hasDebugVmid;
// Register the queue with the GPA session class for timed queue operation support.
if (trace_.gpa_session_->RegisterTimedQueue(iQueue, queue_id,
kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
result = false;
}
return result;
}
// ================================================================================================
// Submits work through the GPA session so that timing data is collected for it.
//
// queue      - PAL queue to submit on
// cmdId      - API command id associated with the generated timing information
// submitInfo - PAL submission description
//
// Returns the result of the timed submit, or the result of a plain queue submit when the
// timed submit did not succeed.
Pal::Result RgpCaptureMgr::TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
                                            const Pal::SubmitInfo& submitInfo) const {
  // Extra meta-data that associates the API command buffer with the timing information
  Pal::uint64 api_cmd_buf_id = cmdId;
  Pal::uint32 sqtt_cmd_buf_id = 0;

  GpuUtil::TimedSubmitInfo timed_info = {};
  timed_info.pApiCmdBufIds = &api_cmd_buf_id;
  timed_info.pSqttCmdBufIds = &sqtt_cmd_buf_id;
  timed_info.frameIndex = 0;

  // Do a timed submit of all the command buffers
  const Pal::Result timed_status = trace_.gpa_session_->TimedSubmit(queue, submitInfo, timed_info);
  if (timed_status == Pal::Result::Success) {
    return timed_status;
  }

  // Punt to non-timed submit if a timed submit fails (or is not supported)
  return queue->Submit(submitInfo);
}
// ================================================================================================
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
//
// This finalizes the developer driver manager.
void RgpCaptureMgr::Finalize()
{
// Figure out if the gfxip supports tracing. We decide tracing if there is at least one
// enumerated GPU that can support tracing. Since we don't yet know if that GPU will be
// picked as the target of an eventual VkDevice, this check is imperfect.
// In mixed-GPU situations where an unsupported GPU is picked for tracing,
// trace capture will fail with an error.
bool hw_support_tracing = false;
if ((rgp_server_->EnableTraces() == DevDriver::Result::Success)) {
if (GpuSupportsTracing(device_.properties(), device_.settings())) {
hw_support_tracing = true;
}
}
if (hw_support_tracing == false) {
rgp_server_->DisableTraces();
}
// Finalize the devmode manager
dev_driver_server_->Finalize();
// Figure out if tracing support should be enabled or not
trace_enabled_ = (rgp_server_ != nullptr) && rgp_server_->TracesEnabled();
}
// ================================================================================================
// Forwards to the driver control server's WaitForDriverResume(), i.e. waits for the driver to
// be resumed if it is currently paused.
void RgpCaptureMgr::WaitForDriverResume() {
  auto* driver_control = dev_driver_server_->GetDriverControlServer();
  assert(driver_control != nullptr);
  driver_control->WaitForDriverResume();
}
// ================================================================================================
// Dispatch-end hook: counts dispatches while a trace is running and stops the SQTT hardware
// trace once max_sqtt_disp_ dispatches have been captured.
//
// gpu - virtual GPU (queue) that issued the dispatch
void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
  if (!rgp_server_->TracesEnabled()) {
    return;
  }

  // Cheap unlocked test first, so dispatches outside of a trace don't take the lock.
  if (trace_.status_ == TraceStatus::Running) {
    amd::ScopedLock traceLock(&trace_mutex_);
    // Re-validate under the lock: another queue may have ended or finished the trace between
    // the unlocked test above and the lock acquisition.
    if (trace_.status_ == TraceStatus::Running) {
      trace_.sqtt_disp_count_++;
      if (trace_.sqtt_disp_count_ >= max_sqtt_disp_) {
        // Enough dispatches captured: submit the commands that stop the hardware trace
        const Pal::Result res = EndRGPHardwareTrace(gpu);
        if (res == Pal::Result::Success) {
          trace_.sqtt_disp_count_ = 0;
        } else if (res != Pal::Result::ErrorIncompatibleQueue) {
          // Any failure other than "wrong queue" aborts the trace
          FinishRGPTrace(gpu, true);
        }
        // ErrorIncompatibleQueue: continue until we find the right queue...
      }
    }
  }

  if (IsQueueTimingActive()) {
    // todo: insert a timed marker that collects a GPU timestamp. The call is currently disabled:
    // GpuUtil::TimedQueuePresentInfo timedPresentInfo = {};
    // Pal::Result result = trace_.gpa_session_->TimedQueuePresent(
    //     gpu->queue(MainEngine).iQueue_, timedPresentInfo);
    // assert(result == Pal::Result::Success);
  }
}
// ================================================================================================
// Polls for the results of a finished trace and, when available, sends them to the RGP server.
//
// Returns Pal::Result::Success when the trace data was fetched and written to the server.
// Returns Pal::Result::NotReady in every other case, including when the data is ready but
// could not be allocated, fetched or transmitted.
// NOTE(review): a failed fetch/transmit is indistinguishable from "not ready" for the caller -
// confirm that retrying on the next dispatch is the intended behavior.
Pal::Result RgpCaptureMgr::CheckForTraceResults() {
  assert(trace_.status_ == TraceStatus::WaitingForResults);

  // Results are available once the GPA session is ready and the "trace end" commands retired
  const bool results_ready = trace_.gpa_session_->IsReady() &&
      trace_.begin_queue_->isDone(&trace_.end_event_);
  if (!results_ready) {
    return Pal::Result::NotReady;
  }

  // First call only queries the required size of the trace data
  size_t byte_count = 0;
  trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &byte_count, nullptr);
  if (byte_count == 0) {
    return Pal::Result::NotReady;
  }

  // Allocate memory for trace data
  void* buffer = amd::AlignedMemory::allocate(byte_count, 256);
  if (buffer == nullptr) {
    return Pal::Result::NotReady;
  }

  bool transmitted = false;
  // Second call copies the trace data out of the GPA session
  if (trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &byte_count, buffer) ==
      Pal::Result::Success) {
    // Transmit trace data to anyone who's listening
    const auto write_status =
        rgp_server_->WriteTraceData(static_cast<Pal::uint8*>(buffer), byte_count);
    transmitted = (write_status == DevDriver::Result::Success);
  }
  amd::AlignedMemory::deallocate(buffer);

  return transmitted ? Pal::Result::Success : Pal::Result::NotReady;
}
// ================================================================================================
// Dispatch-begin hook that drives the RGP trace state machine and, while a trace is running,
// writes the markers that describe the dispatch.
//
// gpu     - virtual GPU (queue) that issues the dispatch
// kernel  - kernel about to be dispatched; its name is written as a user-event marker
// x, y, z - dispatch dimensions written into the event marker
void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
                                size_t x, size_t y, size_t z) {
  // Wait for the driver to be resumed in case it's been paused.
  WaitForDriverResume();

  if (rgp_server_->TracesEnabled()) {
    amd::ScopedLock traceLock(&trace_mutex_);

    // Advance the trace state machine. Exactly one transition is attempted per dispatch, based
    // on the state observed on entry.
    switch (trace_.status_) {
      case TraceStatus::Idle: {
        // Nothing to do unless an RGP trace request is pending
        if (!rgp_server_->IsTracePending()) {
          break;
        }
        // Attempt to start preparing for a trace
        if (PrepareRGPTrace(gpu) != Pal::Result::Success) {
          break;
        }
        // Attempt to start the trace immediately if no preparation dispatches are required
        if ((num_prep_disp_ == 0) && (BeginRGPTrace(gpu) != Pal::Result::Success)) {
          FinishRGPTrace(gpu, true);
        }
        break;
      }

      case TraceStatus::Preparing: {
        // Wait some number of preparation dispatches before starting the trace in order to get
        // enough timer samples to sync CPU/GPU clock domains.
        trace_.prepared_disp_count_++;
        // Take a calibration timing measurement sample for this dispatch.
        trace_.gpa_session_->SampleTimingClocks();
        // Start the SQTT trace if we've waited a sufficient number of preparation dispatches
        if (trace_.prepared_disp_count_ >= num_prep_disp_) {
          const Pal::Result begin_status = BeginRGPTrace(gpu);
          // ErrorIncompatibleQueue is tolerated: wait until the app reaches the same queue
          const bool fatal = (begin_status != Pal::Result::Success) &&
              (begin_status != Pal::Result::ErrorIncompatibleQueue);
          if (fatal) {
            FinishRGPTrace(gpu, true);
          }
        }
        break;
      }

      case TraceStatus::WaitingForSqtt: {
        // We're ending a trace and waiting for SQTT to turn off.
        const bool sqtt_done = trace_.begin_queue_->isDone(&trace_.end_sqtt_event_);
        // todo: There is a wait inside the trace end for now, so the trace is ended whether or
        // not SQTT has already turned off.
        static_cast<void>(sqtt_done);
        if (EndRGPTrace(gpu) != Pal::Result::Success) {
          FinishRGPTrace(gpu, true);
        }
        break;
      }

      case TraceStatus::WaitingForResults: {
        // We're waiting for final trace results.
        const Pal::Result fetch_status = CheckForTraceResults();
        if (fetch_status == Pal::Result::Success) {
          // Results ready: finish trace
          FinishRGPTrace(gpu, false);
        } else if (fetch_status != Pal::Result::NotReady) {
          // Error while computing results: abort trace
          FinishRGPTrace(gpu, true);
        }
        break;
      }

      default:
        break;
    }

    // The state is re-read here because the transition above may have started the trace.
    if (trace_.status_ == TraceStatus::Running) {
      RgpSqttMarkerEventType api_event = RgpSqttMarkerEventType::CmdNDRangeKernel;

      if (kernel.prog().isInternal()) {
        // Event types for the internal blit kernels, indexed like BlitName
        constexpr RgpSqttMarkerEventType BlitApiEvents[KernelBlitManager::BlitTotal] = {
            RgpSqttMarkerEventType::CmdCopyImage,
            RgpSqttMarkerEventType::CmdCopyImage,
            RgpSqttMarkerEventType::CmdCopyImageToBuffer,
            RgpSqttMarkerEventType::CmdCopyBufferToImage,
            RgpSqttMarkerEventType::CmdCopyBuffer,
            RgpSqttMarkerEventType::CmdCopyBuffer,
            RgpSqttMarkerEventType::CmdCopyBuffer,
            RgpSqttMarkerEventType::CmdCopyBuffer,
            RgpSqttMarkerEventType::CmdFillBuffer,
            RgpSqttMarkerEventType::CmdFillImage,
            RgpSqttMarkerEventType::CmdScheduler};

        // Use the event type of the first blit kernel whose name matches
        for (uint blit = 0; blit < KernelBlitManager::BlitTotal; ++blit) {
          if (kernel.name().compare(BlitName[blit]) == 0) {
            api_event = BlitApiEvents[blit];
            break;
          }
        }
      }

      // Write the kernel name marker
      WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
      // Write dispatch marker
      WriteEventWithDimsMarker(gpu, api_event, static_cast<uint32_t>(x),
                               static_cast<uint32_t>(y), static_cast<uint32_t>(z));
    }
  }

  global_disp_count_++;
}
// ================================================================================================
// Starts preparing for an RGP trace. Preparation spans num_prep_disp_ dispatches of lead-up
// time during which timing samples are accumulated to synchronize CPU and GPU clock domains.
//
// On success this function transitions from the Idle state to the Preparing state and records
// gpu as the preparing queue. On failure the trace is aborted on the RGP server and the state
// is left unchanged.
//
// Returns Pal::Result::Success, Pal::Result::ErrorUnknown when the RGP server refused to begin
// a trace, or the error reported by the GPA session's Begin().
Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::Idle);

  Pal::Result result = Pal::Result::Success;

  const auto traceParameters = rgp_server_->QueryTraceParameters();

  num_prep_disp_ = traceParameters.captureStartIndex;

  // Number of dispatches the tool asked to capture. The subtraction is unsigned, so an inverted
  // range yields a huge value, which is rejected by the range check below.
  const uint32_t capture_disp =
      traceParameters.captureStopIndex - traceParameters.captureStartIndex;

  // Validate the request against the configured limit rather than against the value left behind
  // by a previous trace; otherwise one short capture would permanently shrink the limit for
  // every later capture.
  const uint32_t disp_limit = device_.settings().rgpSqttDispCount_;
  max_sqtt_disp_ = ((capture_disp > 0) && (capture_disp < disp_limit)) ? capture_disp : disp_limit;

  trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
  inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens;

  // Notify the RGP server that we are starting a trace
  if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
    result = Pal::Result::ErrorUnknown;
  }

  // Tell the GPA session class we're starting a trace
  if (result == Pal::Result::Success) {
    GpuUtil::GpaSessionBeginInfo info = {};
    info.flags.enableQueueTiming = true;
    result = trace_.gpa_session_->Begin(info);
  }

  trace_.prepared_disp_count_ = 0;
  trace_.sqtt_disp_count_ = 0;

  if (result == Pal::Result::Success) {
    // Sample the timing clocks prior to starting a trace.
    trace_.gpa_session_->SampleTimingClocks();

    // Remember which queue started the trace
    trace_.prepare_queue_ = gpu;
    trace_.begin_queue_ = nullptr;
    trace_.status_ = TraceStatus::Preparing;
  } else if (rgp_server_ != nullptr) {
    // We failed to prepare for the trace so abort it.
    const DevDriver::Result devDriverResult = rgp_server_->AbortTrace();
    // AbortTrace should always succeed unless we've used the api incorrectly.
    assert(devDriverResult == DevDriver::Result::Success);
    // Keeps release builds (asserts compiled out) free of an unused-variable warning
    static_cast<void>(devDriverResult);
  }

  return result;
}
// ================================================================================================
// Begins an RGP trace by recording the GPA "begin sample" commands (SQTT enabled) on the main
// engine of the given queue and flushing them.
//
// On success this function transitions from the Preparing state to the Running state.
//
// Returns Pal::Result::ErrorIncompatibleQueue, without touching any state, when gpu is not the
// queue that prepared the trace; Pal::Result::Success otherwise.
Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::Preparing);
  assert(trace_enabled_);

  // Only allow the trace to start on the queue that prepared it, because the command buffer
  // engine type must match
  if (trace_.prepare_queue_ != gpu) {
    return Pal::Result::ErrorIncompatibleQueue;
  }

  // Describe a GPA tracing sample with SQTT enabled
  GpuUtil::GpaSampleConfig sample_cfg = {};
  sample_cfg.type = GpuUtil::GpaSampleType::Trace;
  sample_cfg.sqtt.gpuMemoryLimit = trace_gpu_mem_limit_;
  sample_cfg.sqtt.seMask = 0xF;
  sample_cfg.sqtt.flags.enable = true;
  sample_cfg.sqtt.flags.supressInstructionTokens = !inst_tracing_enabled_;

  // Fill GPU commands
  gpu->eventBegin(MainEngine);
  trace_.gpa_sample_id_ =
      trace_.gpa_session_->BeginSample(gpu->queue(MainEngine).iCmd(), sample_cfg);
  gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_);

  // Submit the trace-begin commands by updating the global GPU event with a flush
  static constexpr bool NeedFlush = true;
  gpu->setGpuEvent(trace_.begin_sqtt_event_, NeedFlush);

  // Make the trace active and remember which queue started it
  trace_.status_ = TraceStatus::Running;
  trace_.begin_queue_ = gpu;

  return Pal::Result::Success;
}
// ================================================================================================
// Records and flushes the commands that end the GPA tracing sample, i.e. stops SQTT tracing.
//
// On success this function transitions from the Running state to the WaitingForSqtt state.
//
// Returns Pal::Result::ErrorIncompatibleQueue, without touching any state, when gpu is not the
// queue that began the trace; Pal::Result::Success otherwise.
Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::Running);

  // Only allow SQTT trace to start and end on the same queue because it's critical that these
  // are in the same order
  if (trace_.begin_queue_ != gpu) {
    return Pal::Result::ErrorIncompatibleQueue;
  }

  assert(trace_.gpa_session_ != nullptr);

  // Write CB commands to finish the SQTT
  gpu->eventBegin(MainEngine);
  trace_.gpa_session_->EndSample(gpu->queue(MainEngine).iCmd(), trace_.gpa_sample_id_);
  gpu->eventEnd(MainEngine, trace_.end_sqtt_event_);

  // Update the global GPU event
  static constexpr bool NeedFlush = true;
  gpu->setGpuEvent(trace_.end_sqtt_event_, NeedFlush);

  trace_.status_ = TraceStatus::WaitingForSqtt;

  // Optionally block until the end-of-SQTT commands have completed
  if (device_.settings().rgpSqttWaitIdle_) {
    // Make sure the trace is done. Note: required for SDMA data write back
    gpu->waitForEvent(&trace_.end_sqtt_event_);
  }

  return Pal::Result::Success;
}
// ================================================================================================
// Ends the GPA session of a trace whose SQTT part has already been stopped; the session's end
// commands are recorded on the SDMA engine of the given queue.
//
// On success this function transitions from the WaitingForSqtt state to the WaitingForResults
// state. Returns the result of GpaSession::End(); on failure the state is not advanced.
Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::WaitingForSqtt);
  assert(trace_.gpa_session_ != nullptr);

  // Initiate SDMA copy
  gpu->eventBegin(SdmaEngine);
  const Pal::Result end_status = trace_.gpa_session_->End(gpu->queue(SdmaEngine).iCmd());
  gpu->eventEnd(SdmaEngine, trace_.end_event_);

  if (end_status != Pal::Result::Success) {
    return end_status;
  }

  // Submit the trace-end commands by updating the global GPU event with a flush
  static constexpr bool NeedFlush = true;
  gpu->setGpuEvent(trace_.end_event_, NeedFlush);

  trace_.status_ = TraceStatus::WaitingForResults;

  if (device_.settings().rgpSqttWaitIdle_) {
    // Make sure the transfer is done
    gpu->waitForEvent(&trace_.end_event_);
  }

  return end_status;
}
// ================================================================================================
// Finishes or cancels the current RGP trace and returns the trace state to Idle.
//
// gpu     - queue used to stop a still-active hardware trace; may be nullptr
// aborted - true to report the trace as aborted to the RGP server, false to end it normally
//
// Does nothing when no queue has prepared a trace.
void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
  if (trace_.prepare_queue_ == nullptr) {
    return;
  }

  // Stop the hardware trace if the queue goes away before OCL reached the number of captured
  // dispatches. The outcome of the call is not inspected.
  const bool stop_hw_trace = (gpu != nullptr) && (trace_.sqtt_disp_count_ != 0);
  if (stop_hw_trace) {
    EndRGPHardwareTrace(gpu);
  }

  // Inform RGP protocol that we're done with the trace
  if (!aborted) {
    rgp_server_->EndTrace();
  } else {
    rgp_server_->AbortTrace();
  }

  if (trace_.gpa_session_ != nullptr) {
    trace_.gpa_session_->Reset();
  }

  // Reset tracing state to idle
  trace_.status_ = TraceStatus::Idle;
  trace_.prepare_queue_ = nullptr;
  trace_.begin_queue_ = nullptr;
  trace_.gpa_sample_id_ = 0;
  trace_.sqtt_disp_count_ = 0;
  trace_.prepared_disp_count_ = 0;
}
// ================================================================================================
// Destroys device-persistent RGP resources: aborts a trace that is still in flight, frees the
// user-event marker buffer and the GPA session, and zeroes the trace state.
//
// The function can run more than once on the same object (PreDeviceDestroy() and the destructor
// both call it), so every released pointer is reset to nullptr.
void RgpCaptureMgr::DestroyRGPTracing() {
  if (trace_.status_ != TraceStatus::Idle) {
    FinishRGPTrace(nullptr, true);
  }

  // Reset the pointer after the delete; without this a second call would free the buffer again.
  delete user_event_;
  user_event_ = nullptr;

  // Destroy the GPA session
  if (trace_.gpa_session_ != nullptr) {
    delete trace_.gpa_session_;
    trace_.gpa_session_ = nullptr;
  }

  memset(&trace_, 0, sizeof(trace_));
}
// ================================================================================================
// Tells whether tracing is possible for the given device properties and settings: the gfxip
// must report RGP trace support and SQTT must not be force-disabled in the settings.
bool RgpCaptureMgr::GpuSupportsTracing(const Pal::DeviceProperties& props,
                                       const Settings& settings) {
  if (settings.rgpSqttForceDisable_) {
    return false;
  }
  return props.gfxipProperties.flags.supportRgpTraces != 0;
}
// ================================================================================================
// Called at the end of a successful Init(). Marks the driver as fully initialized on the driver
// control server, unless that has already been done.
void RgpCaptureMgr::PostDeviceCreate() {
  amd::ScopedLock traceLock(&trace_mutex_);

  auto* driver_control = dev_driver_server_->GetDriverControlServer();
  assert(driver_control != nullptr);

  // We consider the time after the logical device creation to be the fully initialized driver
  // position. This is mainly because PAL is fully initialized at this point and we also know
  // whether or not the debug vmid has been acquired. External tools use this information to
  // decide when it's reasonable to make certain requests of the driver through protocol
  // functions.
  if (!driver_control->IsDriverInitialized()) {
    driver_control->FinishDriverInitialization();
  }
}
// ================================================================================================
// Called prior to a device's being destroyed. Frees the persistent RGP trace resources, but
// only while no trace is in progress (state is Idle); otherwise nothing is released here.
void RgpCaptureMgr::PreDeviceDestroy() {
  amd::ScopedLock traceLock(&trace_mutex_);

  if (trace_.status_ != TraceStatus::Idle) {
    return;
  }
  DestroyRGPTracing();
}
// ================================================================================================
// Builds the common part of an RGP event marker.
//
// gpu      - queue whose main-engine command buffer id is stored in the marker
// api_type - API event type stored in the marker
//
// Each call consumes one event id: trace_.current_event_id_ is stored in the marker and then
// incremented.
RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(const VirtualGPU* gpu,
                                                   RgpSqttMarkerEventType api_type) const {
  RgpSqttMarkerEvent event = {};

  event.identifier = RgpSqttMarkerIdentifierEvent;
  event.apiType = static_cast<uint32_t>(api_type);
  event.cmdID = trace_.current_event_id_++;
  event.cbID = gpu->queue(MainEngine).cmdBufId();

  return event;
}
// ================================================================================================
// Inserts a raw RGP trace marker into the main-engine command buffer of the given queue.
//
// gpu       - queue that receives the marker
// data      - marker payload
// data_size - payload size in bytes; must be a non-zero multiple of sizeof(uint32_t)
void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const {
  // The command takes the payload size in dwords
  const size_t dword_count = data_size / sizeof(uint32_t);

  assert((data_size % sizeof(uint32_t)) == 0);
  assert(dword_count > 0);

  gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker(static_cast<uint32_t>(dword_count),
                                                         data);
}
// ================================================================================================
// Inserts an RGP pre-dispatch marker
void RgpCaptureMgr::WriteEventWithDimsMarker(
const VirtualGPU* gpu,
RgpSqttMarkerEventType apiType,
uint32_t x,
uint32_t y,
uint32_t z) const
{
assert(apiType != RgpSqttMarkerEventType::Invalid);
RgpSqttMarkerEventWithDims eventWithDims = {};
eventWithDims.event = BuildEventMarker(gpu, apiType);
eventWithDims.event.hasThreadDims = 1;
eventWithDims.threadX = x;
eventWithDims.threadY = y;
eventWithDims.threadZ = z;
WriteMarker(gpu, &eventWithDims, sizeof(eventWithDims));
}
// ================================================================================================
// Inserts an RGP "barrier start" marker into the given queue while a trace is running.
//
// gpu  - queue that receives the marker
// data - PAL barrier description; its reason is stored in the marker
//
// The marker's command buffer id is taken from the queue that began the trace.
void RgpCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
                                            const Pal::Developer::BarrierData& data) const {
  // Cheap unlocked test first, so barriers outside of a trace don't take the lock.
  if (!rgp_server_->TracesEnabled() || (trace_.status_ != TraceStatus::Running)) {
    return;
  }

  amd::ScopedLock traceLock(&trace_mutex_);

  // Re-validate under the lock: the trace may have been finished by another queue in the
  // meantime, which resets begin_queue_ to nullptr.
  if ((trace_.status_ != TraceStatus::Running) || (trace_.begin_queue_ == nullptr)) {
    return;
  }

  RgpSqttMarkerBarrierStart marker = {};
  marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
  marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
  marker.dword02 = data.reason;
  marker.internal = true;

  WriteMarker(gpu, &marker, sizeof(marker));
}
// ================================================================================================
// Inserts an RGP "barrier end" marker into the given queue while a trace is running.
//
// gpu  - queue that receives the marker
// data - PAL barrier description; its pipeline-stall and cache operations are copied into the
//        marker
//
// The marker's command buffer id is taken from the queue that began the trace.
void RgpCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
                                          const Pal::Developer::BarrierData& data) const {
  // Cheap unlocked test first, so barriers outside of a trace don't take the lock.
  if (!rgp_server_->TracesEnabled() || (trace_.status_ != TraceStatus::Running)) {
    return;
  }

  amd::ScopedLock traceLock(&trace_mutex_);

  // Re-validate under the lock: the trace may have been finished by another queue in the
  // meantime, which resets begin_queue_ to nullptr.
  if ((trace_.status_ != TraceStatus::Running) || (trace_.begin_queue_ == nullptr)) {
    return;
  }

  // Only the operations of this barrier are reported; nothing is accumulated from previous
  // markers.
  const auto& operations = data.operations;

  RgpSqttMarkerBarrierEnd marker = {};
  marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
  marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();

  // Pipeline stalls
  marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
  marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
  marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
  marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
  marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
  marker.syncCpDma = operations.pipelineStalls.syncCpDma;

  // Cache flushes and invalidations
  marker.invalTcp = operations.caches.invalTcp;
  marker.invalSqI = operations.caches.invalSqI$;
  marker.invalSqK = operations.caches.invalSqK$;
  marker.flushTcc = operations.caches.flushTcc;
  marker.invalTcc = operations.caches.invalTcc;
  marker.flushCb = operations.caches.flushCb;
  marker.invalCb = operations.caches.invalCb;
  marker.flushDb = operations.caches.flushDb;
  marker.invalDb = operations.caches.invalDb;

  marker.numLayoutTransitions = 0;

  WriteMarker(gpu, &marker, sizeof(marker));
}
// ================================================================================================
// Inserts a user event marker, optionally followed by a string, into the given queue.
//
// gpu       - queue that receives the marker
// eventType - user event type; RgpSqttMarkerUserEventPop carries no string
// name      - string to attach; truncated to the maximum user event string length
//
// The marker is assembled in the shared user_event_ buffer, which is cleared on every call.
void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu,
                                         RgpSqttMarkerUserEventType eventType,
                                         const std::string& name) const {
  memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString));

  user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent;
  user_event_->header.dataType = eventType;

  size_t markerSize = sizeof(user_event_->header);

  if (eventType != RgpSqttMarkerUserEventPop) {
    const size_t strLength =
        std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));

    user_event_->stringLength = static_cast<uint32_t>(strLength);

    // Pack four characters per dword, lowest byte first.
    for (size_t charIdx = 0; charIdx < strLength; ++charIdx) {
      // Go through unsigned char: where plain char is signed, a byte >= 0x80 would otherwise be
      // sign-extended and set the upper bytes of the dword that belong to the next characters.
      const uint32_t c = static_cast<unsigned char>(name[charIdx]);
      user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4)));
    }

    // Every data type other than Pop includes a string length
    markerSize += sizeof(uint32_t);

    // Include the string data (padded up to the nearest dword)
    markerSize += sizeof(uint32_t) * ((strLength + sizeof(uint32_t) - 1) / sizeof(uint32_t));
  }

  WriteMarker(gpu, user_event_, markerSize);
}
}; // namespace pal