Comhaid
rocm-systems/rocclr/runtime/device/pal/palgpuopen.cpp
T
foreman 4794298058 P4 to Git Change 2006447 by gandryey@gera-win10 on 2019/09/30 13:32:36
SWDEV-79445 - OCL generic changes and code clean-up
	- Allow to disable GPUOpen build

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbedefs#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#167 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#18 edit
2019-09-30 13:41:27 -04:00

825 línte
31 KiB
C++

/*
**************************************************************************************************
*
* Trade secret of Advanced Micro Devices, Inc.
* Copyright (c) 2016, Advanced Micro Devices, Inc., (unpublished)
*
* All rights reserved. This notice is intended as a precaution against inadvertent publication
* and does not imply publication or any waiver of confidentiality. The year included in
* the foregoing notice is the year of creation of the work.
*
**************************************************************************************************
*/
#include "device/pal/palgpuopen.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palblit.hpp"
// PAL headers
#include "palCmdAllocator.h"
#include "palFence.h"
#include "palQueueSemaphore.h"
#ifdef PAL_GPUOPEN_OCL
// gpuutil headers
#include "gpuUtil/palGpaSession.h"
// gpuopen headers
#include "devDriverServer.h"
#include "msgChannel.h"
#include "msgTransport.h"
#include "protocols/rgpServer.h"
#include "protocols/driverControlServer.h"
namespace pal {
// ================================================================================================
RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
: device_(device),
dev_driver_server_(platform->GetDevDriverServer()),
user_event_(nullptr),
num_prep_disp_(0),
max_sqtt_disp_(device_.settings().rgpSqttDispCount_),
trace_gpu_mem_limit_(0),
global_disp_count_(1), // Must start from 1 according to RGP spec
trace_enabled_(false),
inst_tracing_enabled_(false) {
memset(&trace_, 0, sizeof(trace_));
}
// ================================================================================================
RgpCaptureMgr::~RgpCaptureMgr() { DestroyRGPTracing(); }
// ================================================================================================
// Creates the GPU Open Developer Mode manager class.
RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) {
RgpCaptureMgr* mgr = new RgpCaptureMgr(platform, device);
if (mgr != nullptr && !mgr->Init(platform)) {
delete mgr;
mgr = nullptr;
}
return mgr;
}
// ================================================================================================
bool RgpCaptureMgr::Init(Pal::IPlatform* platform) {
if (dev_driver_server_ == nullptr) {
return false;
}
const Settings& settings = device_.settings();
// Tell RGP that the server (i.e. the driver) supports tracing if requested.
rgp_server_ = dev_driver_server_->GetRGPServer();
if (rgp_server_ == nullptr) {
return false;
}
// Finalize RGP settings
Finalize();
bool result = true;
// Fail initialization of trace resources if SQTT tracing has been force-disabled from
// the panel (this will consequently fail the trace), or if the chosen device's gfxip
// does not support SQTT.
//
// It's necessary to check this during RGP tracing init in addition to devmode init because
// during the earlier devmode init we may be in a situation where some enumerated physical
// devices support tracing and others do not.
if (GpuSupportsTracing(device_.properties(), settings) == false) {
result = false;
}
// Create a GPA session object for this trace session
if (result) {
assert(trace_.gpa_session_ == nullptr);
const uint32_t api_version = settings.oclVersion_;
trace_.gpa_session_ = new GpuUtil::GpaSession(platform, device_.iDev(),
api_version >> 4, // OCL API version major
api_version & 0xf, // OCL API version minor
RgpSqttInstrumentationSpecVersion,
RgpSqttInstrumentationApiVersion);
if (trace_.gpa_session_ == nullptr) {
result = false;
}
}
// Initialize the GPA session
if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
result = false;
}
if (result) {
user_event_ = new RgpSqttMarkerUserEventWithString;
if (nullptr == user_event_) {
result = false;
}
}
if (!result) {
// If we've failed to initialize tracing, permanently disable traces
if (rgp_server_ != nullptr) {
rgp_server_->DisableTraces();
trace_enabled_ = false;
}
// Clean up if we failed
DestroyRGPTracing();
} else {
PostDeviceCreate();
}
return result;
}
// ================================================================================================
// This function finds out all the queues in the device that we have to synchronize for RGP-traced
// frames and initializes resources for them.
bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue,
bool* debug_vmid) const {
bool result = true;
// Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
// it may be optional for Vulkan, but we provide it anyway if available).
Pal::KernelContextInfo kernelContextInfo = {};
Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo);
// Ensure we've acquired the debug VMID (note that some platforms do not
// implement this function, so don't fail the whole trace if so)
*debug_vmid = kernelContextInfo.flags.hasDebugVmid;
// Register the queue with the GPA session class for timed queue operation support.
if (trace_.gpa_session_->RegisterTimedQueue(
iQueue, queue_id, kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
result = false;
}
return result;
}
// ================================================================================================
Pal::Result RgpCaptureMgr::TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
const Pal::SubmitInfo& submitInfo) const {
// Fill in extra meta-data information to associate the API command buffer data with
// the generated timing information.
GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
Pal::uint64 apiCmdBufIds = cmdId;
Pal::uint32 sqttCmdBufIds = 0;
timedSubmitInfo.pApiCmdBufIds = &apiCmdBufIds;
timedSubmitInfo.pSqttCmdBufIds = &sqttCmdBufIds;
timedSubmitInfo.frameIndex = 0;
// Do a timed submit of all the command buffers
Pal::Result result = trace_.gpa_session_->TimedSubmit(queue, submitInfo, timedSubmitInfo);
// Punt to non-timed submit if a timed submit fails (or is not supported)
if (result != Pal::Result::Success) {
result = queue->Submit(submitInfo);
}
return result;
}
// ================================================================================================
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
//
// This finalizes the developer driver manager.
void RgpCaptureMgr::Finalize() {
// Figure out if the gfxip supports tracing. We decide tracing if there is at least one
// enumerated GPU that can support tracing. Since we don't yet know if that GPU will be
// picked as the target of an eventual VkDevice, this check is imperfect.
// In mixed-GPU situations where an unsupported GPU is picked for tracing,
// trace capture will fail with an error.
bool hw_support_tracing = GpuSupportsTracing(device_.properties(), device_.settings());
if (hw_support_tracing == false) {
rgp_server_->DisableTraces();
}
// Finalize the devmode manager
dev_driver_server_->Finalize();
// Figure out if tracing support should be enabled or not
trace_enabled_ = (rgp_server_ != nullptr) && rgp_server_->TracesEnabled();
}
// ================================================================================================
// Waits for the driver to be resumed if it's currently paused.
void RgpCaptureMgr::WaitForDriverResume() {
auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
assert(pDriverControlServer != nullptr);
pDriverControlServer->DriverTick();
}
// ================================================================================================
// Called before a swap chain presents. This signals a frame-end boundary and
// is used to coordinate RGP trace start/stop.
void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
if (rgp_server_->TracesEnabled()) {
// If there's currently a trace running, submit the trace-end command buffer
if (trace_.status_ == TraceStatus::Running) {
amd::ScopedLock traceLock(&trace_mutex_);
trace_.sqtt_disp_count_++;
if (trace_.sqtt_disp_count_ >= max_sqtt_disp_) {
Pal::Result res = EndRGPHardwareTrace(gpu);
if (Pal::Result::ErrorIncompatibleQueue == res) {
// continue until we find the right queue...
} else if (Pal::Result::Success == res) {
trace_.sqtt_disp_count_ = 0;
} else {
FinishRGPTrace(gpu, true);
}
}
}
if (IsQueueTimingActive()) {
// Call TimedQueuePresent() to insert commands that collect GPU timestamp.
Pal::IQueue* pPalQueue = gpu->queue(MainEngine).iQueue_;
// Currently nothing in the PresentInfo struct is used for inserting a timed present marker.
GpuUtil::TimedQueuePresentInfo timedPresentInfo = {};
// Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
// assert(result == Pal::Result::Success);
}
}
}
// ================================================================================================
Pal::Result RgpCaptureMgr::CheckForTraceResults() {
assert(trace_.status_ == TraceStatus::WaitingForResults);
Pal::Result result = Pal::Result::NotReady;
// Check if trace results are ready
if (trace_.gpa_session_->IsReady() && // GPA session is ready
(trace_.begin_queue_->isDone(&trace_.end_event_))) // "Trace end" cmdbuf has retired
{
bool success = false;
// Fetch required trace data size from GPA session
size_t traceDataSize = 0;
void* pTraceData = nullptr;
trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, nullptr);
// Allocate memory for trace data
if (traceDataSize > 0) {
pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
}
if (pTraceData != nullptr) {
// Get trace data from GPA session
if (trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, pTraceData) ==
Pal::Result::Success) {
// Transmit trace data to anyone who's listening
auto devResult =
rgp_server_->WriteTraceData(static_cast<Pal::uint8*>(pTraceData), traceDataSize);
success = (devResult == DevDriver::Result::Success);
}
amd::AlignedMemory::deallocate(pTraceData);
}
if (success) {
result = Pal::Result::Success;
}
}
return result;
}
// ================================================================================================
// Called after a swap chain presents. This signals a (next) frame-begin boundary and is
// used to coordinate RGP trace start/stop.
void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y,
size_t z) {
// Wait for the driver to be resumed in case it's been paused.
WaitForDriverResume();
if (rgp_server_->TracesEnabled()) {
amd::ScopedLock traceLock(&trace_mutex_);
// Check if there's an RGP trace request pending and we're idle
if ((trace_.status_ == TraceStatus::Idle) && rgp_server_->IsTracePending()) {
// Attempt to start preparing for a trace
if (PrepareRGPTrace(gpu) == Pal::Result::Success) {
// Attempt to start the trace immediately if we do not need to prepare
if (num_prep_disp_ == 0) {
if (BeginRGPTrace(gpu) != Pal::Result::Success) {
FinishRGPTrace(gpu, true);
}
}
}
} else if (trace_.status_ == TraceStatus::Preparing) {
// Wait some number of "preparation frames" before starting the trace in order to get enough
// timer samples to sync CPU/GPU clock domains.
trace_.prepared_disp_count_++;
// Take a calibration timing measurement sample for this frame.
trace_.gpa_session_->SampleTimingClocks();
// Start the SQTT trace if we've waited a sufficient number of preparation frames
if (trace_.prepared_disp_count_ >= num_prep_disp_) {
Pal::Result result = BeginRGPTrace(gpu);
if (Pal::Result::ErrorIncompatibleQueue == result) {
// Let's wait until the app will reach the same queue
} else if (result != Pal::Result::Success) {
FinishRGPTrace(gpu, true);
}
}
}
// Check if we're ending a trace waiting for SQTT to turn off.
// If SQTT has turned off, end the trace
else if (trace_.status_ == TraceStatus::WaitingForSqtt) {
Pal::Result result = Pal::Result::Success;
if (trace_.begin_queue_->isDone(&trace_.end_sqtt_event_)) {
result = EndRGPTrace(gpu);
} else {
// todo: There is a wait inside the trace end for now
result = EndRGPTrace(gpu);
}
if (result != Pal::Result::Success) {
FinishRGPTrace(gpu, true);
}
}
// Check if we're waiting for final trace results.
else if (trace_.status_ == TraceStatus::WaitingForResults) {
Pal::Result result = CheckForTraceResults();
// Results ready: finish trace
if (result == Pal::Result::Success) {
FinishRGPTrace(gpu, false);
}
// Error while computing results: abort trace
else if (result != Pal::Result::NotReady) {
FinishRGPTrace(gpu, true);
}
}
if (trace_.status_ == TraceStatus::Running) {
RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel;
if (kernel.prog().isInternal()) {
constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = {
RgpSqttMarkerEventType::CmdCopyImage,
RgpSqttMarkerEventType::CmdCopyImage,
RgpSqttMarkerEventType::CmdCopyImageToBuffer,
RgpSqttMarkerEventType::CmdCopyBufferToImage,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdCopyBuffer,
RgpSqttMarkerEventType::CmdFillBuffer,
RgpSqttMarkerEventType::CmdFillImage,
RgpSqttMarkerEventType::CmdScheduler};
for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) {
if (kernel.name().compare(BlitName[i]) == 0) {
apiEvent = ApiEvents[i];
break;
}
}
}
WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
// Write disaptch marker
WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y),
static_cast<uint32_t>(z));
}
}
global_disp_count_++;
}
// ================================================================================================
// This function starts preparing for an RGP trace. Preparation involves some N frames of
// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock
// domains.
//
// This function transitions from the Idle state to the Preparing state.
Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Idle);
// We can only trace using a single device at a time currently, so recreate RGP trace
// resources against this new one if the device is changing.
Pal::Result result = Pal::Result::Success;
const auto traceParameters = rgp_server_->QueryTraceParameters();
num_prep_disp_ = traceParameters.captureStartIndex;
uint32_t capture_disp = traceParameters.captureStopIndex - traceParameters.captureStartIndex;
// Validate if the captured dispatches are in the range
if ((capture_disp > 0) && (capture_disp < max_sqtt_disp_)) {
max_sqtt_disp_ = capture_disp;
}
trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens;
// Notify the RGP server that we are starting a trace
if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
result = Pal::Result::ErrorUnknown;
}
// Tell the GPA session class we're starting a trace
if (result == Pal::Result::Success) {
GpuUtil::GpaSessionBeginInfo info = {};
info.flags.enableQueueTiming = true; // trace_.queueTimingEnabled;
result = trace_.gpa_session_->Begin(info);
}
trace_.prepared_disp_count_ = 0;
trace_.sqtt_disp_count_ = 0;
// Sample the timing clocks prior to starting a trace.
if (result == Pal::Result::Success) {
trace_.gpa_session_->SampleTimingClocks();
}
if (result == Pal::Result::Success) {
// Remember which queue started the trace
trace_.prepare_queue_ = gpu;
trace_.begin_queue_ = nullptr;
trace_.status_ = TraceStatus::Preparing;
} else {
// We failed to prepare for the trace so abort it.
if (rgp_server_ != nullptr) {
const DevDriver::Result devDriverResult = rgp_server_->AbortTrace();
// AbortTrace should always succeed unless we've used the api incorrectly.
assert(devDriverResult == DevDriver::Result::Success);
}
}
return result;
}
// ================================================================================================
// This function begins an RGP trace by initializing all dependent resources and submitting
// the "begin trace" information command buffer.
//
// This function transitions from the Preparing state to the Running state.
Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Preparing);
assert(trace_enabled_);
// We can only trace using a single device at a time currently, so recreate RGP trace
// resources against this new one if the device is changing.
Pal::Result result = Pal::Result::Success;
if (result == Pal::Result::Success) {
// Only allow trace to start if the queue family at prep-time matches the queue
// family at begin time because the command buffer engine type must match
if (trace_.prepare_queue_ != gpu) {
result = Pal::Result::ErrorIncompatibleQueue;
}
}
// Start a GPA tracing sample with SQTT enabled
if (result == Pal::Result::Success) {
GpuUtil::GpaSampleConfig sampleConfig = {};
sampleConfig.type = GpuUtil::GpaSampleType::Trace;
sampleConfig.sqtt.gpuMemoryLimit = trace_gpu_mem_limit_;
sampleConfig.sqtt.seMask = 0xF;
sampleConfig.sqtt.flags.enable = true;
sampleConfig.sqtt.flags.supressInstructionTokens = (inst_tracing_enabled_ == false);
// Fill GPU commands
gpu->eventBegin(MainEngine);
trace_.gpa_sample_id_ =
trace_.gpa_session_->BeginSample(gpu->queue(MainEngine).iCmd(), sampleConfig);
gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_);
}
// Submit the trace-begin command buffer
if (result == Pal::Result::Success) {
static constexpr bool NeedFlush = true;
// Update the global GPU event
gpu->setGpuEvent(trace_.begin_sqtt_event_, NeedFlush);
}
// Make the trace active and remember which queue started it
if (result == Pal::Result::Success) {
trace_.status_ = TraceStatus::Running;
trace_.begin_queue_ = gpu;
}
return result;
}
// ================================================================================================
// This function submits the command buffer to stop SQTT tracing. Full tracing still continues.
//
// This function transitions from the Running state to the WaitingForSqtt state.
Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::Running);
Pal::Result result = Pal::Result::Success;
// Only allow SQTT trace to start and end on the same queue because it's critical that these are
// in the same order
if (gpu != trace_.begin_queue_) {
result = Pal::Result::ErrorIncompatibleQueue;
}
// Tell the GPA session to insert any necessary commands to end the tracing sample and
// end the session itself
if (result == Pal::Result::Success) {
assert(trace_.gpa_session_ != nullptr);
// Write CB commands to finish the SQTT
gpu->eventBegin(MainEngine);
trace_.gpa_session_->EndSample(gpu->queue(MainEngine).iCmd(), trace_.gpa_sample_id_);
gpu->eventEnd(MainEngine, trace_.end_sqtt_event_);
static constexpr bool NeedFlush = true;
// Update the global GPU event
gpu->setGpuEvent(trace_.end_sqtt_event_, NeedFlush);
trace_.status_ = TraceStatus::WaitingForSqtt;
// Execute a device wait idle
if (device_.settings().rgpSqttWaitIdle_) {
// Make sure the trace is done. Note: required for SDMA data write back
gpu->waitForEvent(&trace_.end_sqtt_event_);
}
}
return result;
}
// ================================================================================================
// This function ends a running RGP trace.
//
// This function transitions from the WaitingForSqtt state to WaitingForResults state.
Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
assert(trace_.status_ == TraceStatus::WaitingForSqtt);
Pal::Result result = Pal::Result::Success;
// Tell the GPA session to insert any necessary commands to end the tracing sample and
// end the session itself
if (result == Pal::Result::Success) {
assert(trace_.gpa_session_ != nullptr);
EngineType engine = (gpu->dev().settings().disableSdma_) ? MainEngine : SdmaEngine;
// Initiate SDMA copy
gpu->eventBegin(engine);
result = trace_.gpa_session_->End(gpu->queue(engine).iCmd());
gpu->eventEnd(engine, trace_.end_event_);
}
// Submit the trace-end command buffer
if (result == Pal::Result::Success) {
static constexpr bool NeedFlush = true;
// Update the global GPU event
gpu->setGpuEvent(trace_.end_event_, NeedFlush);
trace_.status_ = TraceStatus::WaitingForResults;
if (device_.settings().rgpSqttWaitIdle_) {
// Make sure the transfer is done
gpu->waitForEvent(&trace_.end_event_);
}
}
return result;
}
// ================================================================================================
// This function resets and possibly cancels a currently active (between begin/end) RGP trace.
// It frees any dependent resources.
void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
if (trace_.prepare_queue_ == nullptr) {
return;
}
// Finish the trace if the queue was destroyed before
// OCL reached the number of captured dispatches
if ((trace_.sqtt_disp_count_ != 0) && (gpu != nullptr)) {
EndRGPHardwareTrace(gpu);
}
// Inform RGP protocol that we're done with the trace, either by aborting it or finishing normally
if (aborted) {
rgp_server_->AbortTrace();
} else {
rgp_server_->EndTrace();
}
if (trace_.gpa_session_ != nullptr) {
trace_.gpa_session_->Reset();
}
// Reset tracing state to idle
trace_.prepared_disp_count_ = 0;
trace_.sqtt_disp_count_ = 0;
trace_.gpa_sample_id_ = 0;
trace_.status_ = TraceStatus::Idle;
trace_.prepare_queue_ = nullptr;
trace_.begin_queue_ = nullptr;
}
// ================================================================================================
// Destroys device-persistent RGP resources
void RgpCaptureMgr::DestroyRGPTracing() {
if (trace_.status_ != TraceStatus::Idle) {
FinishRGPTrace(nullptr, true);
}
delete user_event_;
// Destroy the GPA session
if (trace_.gpa_session_ != nullptr) {
// Util::Destructor(trace_.gpa_session_);
delete trace_.gpa_session_;
trace_.gpa_session_ = nullptr;
}
memset(&trace_, 0, sizeof(trace_));
}
// ================================================================================================
// Returns true if the given device properties/settings support tracing.
bool RgpCaptureMgr::GpuSupportsTracing(const Pal::DeviceProperties& props,
const Settings& settings) {
return props.gfxipProperties.flags.supportRgpTraces && !settings.rgpSqttForceDisable_;
}
// ================================================================================================
// Called when a new device is created. This will preallocate reusable RGP trace resources
// for that device.
void RgpCaptureMgr::PostDeviceCreate() {
amd::ScopedLock traceLock(&trace_mutex_);
auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
assert(pDriverControlServer != nullptr);
// If the driver hasn't been marked as fully initialized yet, mark it now.
// We consider the time after the logical device creation to be the fully initialized driver
// position. This is mainly because PAL is fully initialized at this point and we also know
// whether or not the debug vmid has been acquired. External tools use this information to
// decide when it's reasonable to make certain requests of the driver through protocol functions.
if (pDriverControlServer->IsDriverInitialized() == false) {
pDriverControlServer->FinishDeviceInit();
}
}
// ================================================================================================
// Called prior to a device's being destroyed. This will free persistent RGP trace resources for
// that device.
void RgpCaptureMgr::PreDeviceDestroy() {
amd::ScopedLock traceLock(&trace_mutex_);
// If we are idle, we can re-initialize trace resources based on the new device.
if (trace_.status_ == TraceStatus::Idle) {
DestroyRGPTracing();
}
}
// ================================================================================================
// Sets up an Event marker's basic data.
RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(const VirtualGPU* gpu,
RgpSqttMarkerEventType api_type) const {
RgpSqttMarkerEvent marker = {};
marker.identifier = RgpSqttMarkerIdentifierEvent;
marker.apiType = static_cast<uint32_t>(api_type);
marker.cmdID = trace_.current_event_id_++;
marker.cbID = gpu->queue(MainEngine).cmdBufId();
return marker;
}
// ================================================================================================
void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const {
assert((data_size % sizeof(uint32_t)) == 0);
assert((data_size / sizeof(uint32_t)) > 0);
gpu->queue(MainEngine)
.iCmd()
->CmdInsertRgpTraceMarker(static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
}
// ================================================================================================
// Inserts an RGP pre-dispatch marker
void RgpCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
uint32_t x, uint32_t y, uint32_t z) const {
assert(apiType != RgpSqttMarkerEventType::Invalid);
RgpSqttMarkerEventWithDims eventWithDims = {};
eventWithDims.event = BuildEventMarker(gpu, apiType);
eventWithDims.event.hasThreadDims = 1;
eventWithDims.threadX = x;
eventWithDims.threadY = y;
eventWithDims.threadZ = z;
WriteMarker(gpu, &eventWithDims, sizeof(eventWithDims));
}
// ================================================================================================
void RgpCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
const Pal::Developer::BarrierData& data) const {
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
amd::ScopedLock traceLock(&trace_mutex_);
RgpSqttMarkerBarrierStart marker = {};
marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
marker.dword02 = data.reason;
marker.internal = true;
WriteMarker(gpu, &marker, sizeof(marker));
}
}
// ================================================================================================
void RgpCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
const Pal::Developer::BarrierData& data) const {
if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
amd::ScopedLock traceLock(&trace_mutex_);
// Copy the operations part and include the same data from previous markers
// within the same barrier sequence to create a full picture of all cache
// syncs and pipeline stalls.
auto operations = data.operations;
operations.pipelineStalls.u16All |= 0;
operations.caches.u16All |= 0;
RgpSqttMarkerBarrierEnd marker = {};
marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
marker.waitOnEopTs = operations.pipelineStalls.eopTsBottomOfPipe;
marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
marker.syncCpDma = operations.pipelineStalls.syncCpDma;
marker.invalTcp = operations.caches.invalTcp;
marker.invalSqI = operations.caches.invalSqI$;
marker.invalSqK = operations.caches.invalSqK$;
marker.flushTcc = operations.caches.flushTcc;
marker.invalTcc = operations.caches.invalTcc;
marker.flushCb = operations.caches.flushCb;
marker.invalCb = operations.caches.invalCb;
marker.flushDb = operations.caches.flushDb;
marker.invalDb = operations.caches.invalDb;
marker.numLayoutTransitions = 0;
WriteMarker(gpu, &marker, sizeof(marker));
}
}
// ================================================================================================
// Inserts a user event string marker
void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu,
RgpSqttMarkerUserEventType eventType,
const std::string& name) const {
memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString));
user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent;
user_event_->header.dataType = eventType;
size_t markerSize = sizeof(user_event_->header);
if ((eventType != RgpSqttMarkerUserEventPop)) {
size_t strLength =
std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
for (uint32_t charIdx = 0; charIdx < strLength; ++charIdx) {
uint32_t c = static_cast<uint32_t>(name[charIdx]);
user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4)));
user_event_->stringLength = static_cast<uint32_t>(strLength);
}
// Every data type other than Pop includes a string length
markerSize += sizeof(uint32_t);
// Include string length (padded up to the nearest dword)
markerSize += sizeof(uint32_t) * ((strLength + sizeof(uint32_t) - 1) / sizeof(uint32_t));
}
WriteMarker(gpu, user_event_, markerSize);
}
} // namespace pal
#endif // PAL_GPUOPEN_OCL