rocm-systems/rocclr/runtime/device/pal/palgpuopen.cpp

/*
 **************************************************************************************************
 *
 *  Trade secret of Advanced Micro Devices, Inc.
 *  Copyright (c) 2016, Advanced Micro Devices, Inc., (unpublished)
 *
 *  All rights reserved. This notice is intended as a precaution against inadvertent publication
 *  and does not imply  publication or any waiver of confidentiality. The year included in
 *  the foregoing notice is the year of creation of the work.
 *
 **************************************************************************************************
 */
#include "device/pal/palgpuopen.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palblit.hpp"

// PAL headers
#include "palCmdAllocator.h"
#include "palFence.h"
#include "palQueueSemaphore.h"

// gpuutil headers
#include "gpuUtil/palGpaSession.h"

// gpuopen headers
#include "devDriverServer.h"
#include "msgChannel.h"
#include "msgTransport.h"
#include "protocols/rgpServer.h"
#include "protocols/driverControlServer.h"

namespace pal
{
// ================================================================================================
// Builds a capture manager bound to `device`. The developer-driver server is fetched from
// `platform`; tracing starts out disabled (trace_enabled_ is false) and the trace state block
// is zeroed.
RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
  : device_(device)
  , dev_driver_server_(platform->GetDevDriverServer())
  , user_event_(nullptr)
  , num_prep_disp_(0)
  , max_sqtt_disp_(device.settings().rgpSqttDispCount_)
  , trace_gpu_mem_limit_(0)
  , global_disp_count_(1)  // Must start from 1 according to RGP spec
  , trace_enabled_(false)
  , inst_tracing_enabled_(false)
{
  // Clear every field of the trace state in one go.
  memset(&trace_, 0, sizeof(trace_));
}

// ================================================================================================
// Releases whatever tracing resources the manager still holds.
RgpCaptureMgr::~RgpCaptureMgr()
{
  // All teardown logic lives in DestroyRGPTracing().
  this->DestroyRGPTracing();
}

// ================================================================================================
// Factory for the GPU Open developer-mode capture manager.
// Returns the initialized manager, or nullptr when allocation or Init() fails.
RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device)
{
  RgpCaptureMgr* capture_mgr = new RgpCaptureMgr(platform, device);
  if (capture_mgr == nullptr) {
    return nullptr;
  }

  if (capture_mgr->Init(platform)) {
    return capture_mgr;
  }

  // Initialization failed: dispose of the half-built manager.
  delete capture_mgr;
  return nullptr;
}

// ================================================================================================
// Hooks the manager up to the developer-driver RGP server and creates the trace resources
// (the GPA session and the scratch user-event marker).
// Returns false when there is no developer-driver server or RGP server, when the device does
// not support tracing, or when any resource fails to initialize. After the RGP server has been
// obtained, a failure also disables traces on the server and destroys the partial resources.
bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
{
  // Without a developer-driver server there is nothing to attach to.
  if (dev_driver_server_ == nullptr) {
    return false;
  }

  const Settings& settings = device_.settings();

  // Tell RGP that the server (i.e. the driver) supports tracing if requested.
  rgp_server_ = dev_driver_server_->GetRGPServer();
  if (rgp_server_ == nullptr) {
    return false;
  }

  // Finalize RGP settings
  Finalize();

  // Fail initialization of trace resources if SQTT tracing has been force-disabled from
  // the panel (this will consequently fail the trace), or if the chosen device's gfxip
  // does not support SQTT. Finalize() performs the same check, but only to decide whether
  // traces are advertised; here it gates resource creation.
  bool ok = GpuSupportsTracing(device_.properties(), settings);

  // Create a GPA session object for this trace session
  if (ok) {
    assert(trace_.gpa_session_ == nullptr);

    const uint32_t api_version = settings.oclVersion_;

    trace_.gpa_session_ = new GpuUtil::GpaSession(
        platform,
        device_.iDev(),
        api_version >> 4,   // OCL API version major
        api_version & 0xf,  // OCL API version minor
        RgpSqttInstrumentationSpecVersion,
        RgpSqttInstrumentationApiVersion);

    ok = (trace_.gpa_session_ != nullptr);
  }

  // Initialize the GPA session
  if (ok) {
    ok = (trace_.gpa_session_->Init() == Pal::Result::Success);
  }

  // Scratch buffer reused by WriteUserEventMarker()
  if (ok) {
    user_event_ = new RgpSqttMarkerUserEventWithString;
    ok = (user_event_ != nullptr);
  }

  if (ok) {
    PostDeviceCreate();
    return true;
  }

  // If we've failed to initialize tracing, permanently disable traces
  if (rgp_server_ != nullptr) {
    rgp_server_->DisableTraces();

    trace_enabled_ = false;
  }

  // Clean up if we failed
  DestroyRGPTracing();
  return false;
}

// ================================================================================================
// This function finds out all the queues in the device that we have to synchronize for RGP-traced
// frames and initializes resources for them.
bool RgpCaptureMgr::RegisterTimedQueue(
  uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const
{
  bool result = true;

  // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
  // it may be optional for Vulkan, but we provide it anyway if available).
  Pal::KernelContextInfo kernelContextInfo = {};

  Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo);

  // Ensure we've acquired the debug VMID (note that some platforms do not
  // implement this function, so don't fail the whole trace if so)
  *debug_vmid = kernelContextInfo.flags.hasDebugVmid;

  // Register the queue with the GPA session class for timed queue operation support.
  if (trace_.gpa_session_->RegisterTimedQueue(iQueue, queue_id,
      kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
    result = false;
  }

  return result;
}

// ================================================================================================
// Submits `submitInfo` on `queue` through the GPA session so the submission is timed, tagging
// it with `cmdId` as the API command buffer id. Falls back to a plain queue submit when the
// timed submit does not succeed, and returns the result of whichever submit was used last.
Pal::Result RgpCaptureMgr::TimedQueueSubmit(
  Pal::IQueue*  queue,
  uint64_t      cmdId,
  const Pal::SubmitInfo& submitInfo) const
{
  // Extra meta-data that associates the API command buffer with the generated timing
  // information. The ids live on the stack; the submit info only points at them.
  Pal::uint64 api_cmd_buf_id  = cmdId;
  Pal::uint32 sqtt_cmd_buf_id = 0;

  GpuUtil::TimedSubmitInfo timed_info = {};
  timed_info.pApiCmdBufIds  = &api_cmd_buf_id;
  timed_info.pSqttCmdBufIds = &sqtt_cmd_buf_id;
  timed_info.frameIndex     = 0;

  // Do a timed submit of all the command buffers
  if (trace_.gpa_session_->TimedSubmit(queue, submitInfo, timed_info) == Pal::Result::Success) {
    return Pal::Result::Success;
  }

  // Punt to non-timed submit if a timed submit fails (or is not supported)
  return queue->Submit(submitInfo);
}

// ================================================================================================
// Called from Init() once the RGP server has been obtained.
//
// Decides whether traces are advertised to the RGP server, finalizes the developer driver
// server and caches the resulting enable state in trace_enabled_.
void RgpCaptureMgr::Finalize()
{
  // Traces are only kept enabled when the server accepts the request AND this device's gfxip
  // supports tracing (and SQTT has not been force-disabled in the settings).
  const bool server_accepted =
      (rgp_server_->EnableTraces() == DevDriver::Result::Success);
  const bool hw_support_tracing =
      server_accepted && GpuSupportsTracing(device_.properties(), device_.settings());

  if (!hw_support_tracing) {
    rgp_server_->DisableTraces();
  }

  // Finalize the devmode manager
  dev_driver_server_->Finalize();

  // Figure out if tracing support should be enabled or not
  trace_enabled_ = (rgp_server_ == nullptr) ? false : rgp_server_->TracesEnabled();
}


// ================================================================================================
// Blocks until the driver control server reports that the driver has been resumed, in case it
// is currently paused.
void RgpCaptureMgr::WaitForDriverResume()
{
  auto* const driver_control = dev_driver_server_->GetDriverControlServer();
  assert(driver_control != nullptr);

  driver_control->WaitForDriverResume();
}

// ================================================================================================
// Post-dispatch hook for `gpu`. While a trace is running, counts the dispatch against the
// capture and, once max_sqtt_disp_ dispatches have been seen, tries to stop the SQTT hardware
// trace on this queue.
void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
{
  if (!rgp_server_->TracesEnabled()) {
    return;
  }

  // Unlocked check: cheap fast path so that dispatches outside of a capture do not take the
  // trace lock.
  if (trace_.status_ == TraceStatus::Running) {
    amd::ScopedLock traceLock(&trace_mutex_);

    // Re-validate under the lock. Another queue may have ended or aborted the trace between
    // the unlocked check and the lock acquisition; counting this dispatch or ending the
    // hardware trace again in that case would act on a trace that is no longer running.
    if (trace_.status_ == TraceStatus::Running) {
      trace_.sqtt_disp_count_++;

      // If the capture is full, submit the trace-end commands
      if (trace_.sqtt_disp_count_ >= max_sqtt_disp_) {
        const Pal::Result res = EndRGPHardwareTrace(gpu);

        if (res == Pal::Result::Success) {
          trace_.sqtt_disp_count_ = 0;
        } else if (res != Pal::Result::ErrorIncompatibleQueue) {
          // Anything but "wrong queue" is fatal for this trace
          FinishRGPTrace(gpu, true);
        }
        // ErrorIncompatibleQueue: continue until we find the right queue...
      }
    }
  }

  if (IsQueueTimingActive()) {
    // Call TimedQueuePresent() to insert commands that collect GPU timestamp.
    // NOTE(review): the actual call is commented out, so the locals below are currently unused.
    Pal::IQueue* pPalQueue = gpu->queue(MainEngine).iQueue_;

    // Currently nothing in the PresentInfo struct is used for inserting a timed present marker.
    GpuUtil::TimedQueuePresentInfo timedPresentInfo = {};
    //Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
    //assert(result == Pal::Result::Success);
  }
}

// ================================================================================================
// Polls for the results of a finished trace and, when they are available, sends them to the
// RGP server.
//
// Returns:
//   Pal::Result::Success  - trace data was fetched and written to the RGP server.
//   Pal::Result::NotReady - the GPA session or the trace-end commands have not completed yet
//                           (or GetResults() itself reported NotReady); call again later.
//   any other value       - the results are ready but could not be retrieved or transmitted.
//                           The caller is expected to abort the trace. Previously every such
//                           failure was reported as NotReady, which made the trace retry on
//                           each dispatch forever.
Pal::Result RgpCaptureMgr::CheckForTraceResults()
{
  assert(trace_.status_ == TraceStatus::WaitingForResults);

  // Results exist only once the GPA session is ready and the "trace end" commands have retired
  if (!trace_.gpa_session_->IsReady() ||
      !trace_.begin_queue_->isDone(&trace_.end_event_)) {
    return Pal::Result::NotReady;
  }

  // Fetch required trace data size from GPA session. As in the original code, only the
  // reported size is used to judge this query.
  size_t traceDataSize = 0;
  trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, nullptr);

  if (traceDataSize == 0) {
    // A ready session with no data will never produce any; report a hard error
    return Pal::Result::ErrorUnknown;
  }

  // Allocate memory for trace data
  void* pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
  if (pTraceData == nullptr) {
    return Pal::Result::ErrorOutOfMemory;
  }

  // Get trace data from GPA session
  Pal::Result result =
      trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, pTraceData);

  if (result == Pal::Result::Success) {
    // Transmit trace data to anyone who's listening
    const auto devResult = rgp_server_->WriteTraceData(
        static_cast<Pal::uint8*>(pTraceData), traceDataSize);

    if (devResult != DevDriver::Result::Success) {
      result = Pal::Result::ErrorUnknown;
    }
  }

  amd::AlignedMemory::deallocate(pTraceData);

  return result;
}

// ================================================================================================
// Pre-dispatch hook (presumably invoked ahead of every kernel dispatch on `gpu` - confirm
// against the caller). Drives the RGP trace state machine
// (Idle -> Preparing -> Running -> WaitingForSqtt -> WaitingForResults -> Idle) and, while a
// trace is running, writes the user-event and dispatch markers for `kernel`.
//
// gpu     - queue that is about to execute the dispatch
// kernel  - kernel being dispatched; its name is emitted as a user-event marker
// x, y, z - dispatch dimensions recorded in the event marker (narrowed to 32 bits)
void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
  size_t x, size_t y, size_t z)
{
  // Wait for the driver to be resumed in case it's been paused.
  WaitForDriverResume();

  if (rgp_server_->TracesEnabled()) {
    // All trace state transitions below happen under the trace lock
    amd::ScopedLock traceLock(&trace_mutex_);

    // Check if there's an RGP trace request pending and we're idle
    if ((trace_.status_ == TraceStatus::Idle) && rgp_server_->IsTracePending()) {
      // Attempt to start preparing for a trace
      if (PrepareRGPTrace(gpu) == Pal::Result::Success) {
        // Attempt to start the trace immediately if we do not need to prepare
        if (num_prep_disp_ == 0) {
          if (BeginRGPTrace(gpu) != Pal::Result::Success) {
            FinishRGPTrace(gpu, true);
          }
        }
      }
    }
    else if (trace_.status_ == TraceStatus::Preparing) {
      // Wait some number of preparation dispatches before starting the trace in order to get
      // enough timer samples to sync CPU/GPU clock domains.
      trace_.prepared_disp_count_++;

      // Take a calibration timing measurement sample for this dispatch.
      trace_.gpa_session_->SampleTimingClocks();

      // Start the SQTT trace if we've waited a sufficient number of preparation dispatches
      if (trace_.prepared_disp_count_ >= num_prep_disp_) {
        Pal::Result result = BeginRGPTrace(gpu);

        if (Pal::Result::ErrorIncompatibleQueue == result) {
          // Let's wait until the app will reach the same queue
        } else if (result != Pal::Result::Success) {
          FinishRGPTrace(gpu, true);
        }
      }
    }
    // Check if we're ending a trace waiting for SQTT to turn off.
    // If SQTT has turned off, end the trace
    else if (trace_.status_ == TraceStatus::WaitingForSqtt) {
      Pal::Result result      = Pal::Result::Success;

      // Note: both branches currently call EndRGPTrace(); the completion check does not
      // change the outcome yet (see the todo below).
      if (trace_.begin_queue_->isDone(&trace_.end_sqtt_event_)) {
        result = EndRGPTrace(gpu);
      } else {
        // todo: There is a wait inside the trace end for now
        result = EndRGPTrace(gpu);
      }

      if (result != Pal::Result::Success) {
        FinishRGPTrace(gpu, true);
      }
    }
    // Check if we're waiting for final trace results.
    else if (trace_.status_ == TraceStatus::WaitingForResults) {
      Pal::Result result = CheckForTraceResults();

      // Results ready: finish trace
      if (result == Pal::Result::Success) {
        FinishRGPTrace(gpu, false);
      }
      // Error while computing results: abort trace
      else if (result != Pal::Result::NotReady) {
        FinishRGPTrace(gpu, true);
      }
    }

    // Checked separately (not part of the else-if chain above) so that a trace which has just
    // switched to Running in this call already gets markers for the current dispatch.
    if (trace_.status_ == TraceStatus::Running) {
      // Default API event for user kernels
      RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel;
      if (kernel.prog().isInternal()) {
        // Kernels from internal programs are reported under the API command listed here.
        // The table is indexed in step with BlitName[]: entry i is used when the kernel name
        // equals BlitName[i].
        constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = {
          RgpSqttMarkerEventType::CmdCopyImage, RgpSqttMarkerEventType::CmdCopyImage,
          RgpSqttMarkerEventType::CmdCopyImageToBuffer,
          RgpSqttMarkerEventType::CmdCopyBufferToImage,
          RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
          RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
          RgpSqttMarkerEventType::CmdFillBuffer, RgpSqttMarkerEventType::CmdFillImage,
          RgpSqttMarkerEventType::CmdScheduler
        };
        for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) {
          if (kernel.name().compare(BlitName[i]) == 0) {
            apiEvent = ApiEvents[i];
            break;
          }
        }
      }
      // Write the kernel name as a user-event marker
      WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
      // Write dispatch marker
      WriteEventWithDimsMarker(gpu, apiEvent,
        static_cast<uint32_t>(x), static_cast<uint32_t>(y), static_cast<uint32_t>(z));
    }
  }

  // Counted for every call, whether or not traces are enabled. Note that this increment is
  // outside the scope of the trace lock taken above.
  global_disp_count_++;
}

// ================================================================================================
// Starts preparing for an RGP trace. Preparation covers the lead-up dispatches
// (num_prep_disp_) during which timing samples are accumulated to synchronize the CPU and GPU
// clock domains.
//
// On success the state moves from Idle to Preparing and `gpu` is remembered as the preparing
// queue. On failure the trace is aborted on the RGP server and the error is returned.
Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
{
  assert(trace_.status_ == TraceStatus::Idle);

  // Pull the capture parameters requested by the tool
  const auto params = rgp_server_->QueryTraceParameters();

  num_prep_disp_ = params.captureStartIndex;

  // Validate if the captured dispatches are in the range
  // NOTE(review): a smaller range shrinks max_sqtt_disp_ and nothing in this function
  // restores it for later traces - confirm that this is intended.
  uint32_t capture_disp = params.captureStopIndex - params.captureStartIndex;
  if ((capture_disp > 0) && (capture_disp < max_sqtt_disp_)) {
    max_sqtt_disp_ = capture_disp;
  }

  // NOTE(review): if gpuMemoryLimitInMb is a 32-bit value, this product wraps for limits of
  // 4096 MB and above - confirm the member types.
  trace_gpu_mem_limit_  = params.gpuMemoryLimitInMb * 1024 * 1024;
  inst_tracing_enabled_ = params.flags.enableInstructionTokens;

  // Notify the RGP server that we are starting a trace
  Pal::Result result = (rgp_server_->BeginTrace() == DevDriver::Result::Success)
      ? Pal::Result::Success
      : Pal::Result::ErrorUnknown;

  // Tell the GPA session class we're starting a trace
  if (result == Pal::Result::Success) {
    GpuUtil::GpaSessionBeginInfo begin_info = {};

    begin_info.flags.enableQueueTiming   = true;// trace_.queueTimingEnabled;

    result = trace_.gpa_session_->Begin(begin_info);
  }

  // The counters are reset whether or not the steps above succeeded
  trace_.prepared_disp_count_ = 0;
  trace_.sqtt_disp_count_     = 0;

  if (result != Pal::Result::Success) {
    // We failed to prepare for the trace so abort it.
    if (rgp_server_ != nullptr) {
      const DevDriver::Result devDriverResult = rgp_server_->AbortTrace();

      // AbortTrace should always succeed unless we've used the api incorrectly.
      assert(devDriverResult == DevDriver::Result::Success);
    }
    return result;
  }

  // Sample the timing clocks prior to starting a trace.
  trace_.gpa_session_->SampleTimingClocks();

  // Remember which queue started the trace
  trace_.prepare_queue_ = gpu;
  trace_.begin_queue_   = nullptr;

  trace_.status_ = TraceStatus::Preparing;

  return result;
}

// ================================================================================================
// Begins an RGP trace: records the commands that start an SQTT-enabled GPA trace sample on the
// main engine of `gpu` and flushes them.
//
// On success the state moves from Preparing to Running and `gpu` becomes the begin queue.
// Returns ErrorIncompatibleQueue, leaving the state untouched, when `gpu` is not the queue
// that prepared the trace.
Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
{
  assert(trace_.status_ == TraceStatus::Preparing);
  assert(trace_enabled_);

  // Only allow trace to start if the queue at prep-time matches the queue at begin time
  // because the command buffer engine type must match
  if (trace_.prepare_queue_ != gpu) {
    return Pal::Result::ErrorIncompatibleQueue;
  }

  // Describe a GPA tracing sample with SQTT enabled
  GpuUtil::GpaSampleConfig sample_cfg = {};

  sample_cfg.type                                = GpuUtil::GpaSampleType::Trace;
  sample_cfg.sqtt.gpuMemoryLimit                 = trace_gpu_mem_limit_;
  sample_cfg.sqtt.seMask                         = 0xF;
  sample_cfg.sqtt.flags.enable                   = true;
  sample_cfg.sqtt.flags.supressInstructionTokens = !inst_tracing_enabled_;

  // Fill GPU commands, bracketed by the begin-SQTT event
  gpu->eventBegin(MainEngine);
  trace_.gpa_sample_id_ = trace_.gpa_session_->BeginSample(
      gpu->queue(MainEngine).iCmd(), sample_cfg);
  gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_);

  // Submit the trace-begin commands: update the global GPU event with a flush
  static constexpr bool NeedFlush = true;
  gpu->setGpuEvent(trace_.begin_sqtt_event_, NeedFlush);

  // Make the trace active and remember which queue started it
  trace_.status_      = TraceStatus::Running;
  trace_.begin_queue_ = gpu;

  return Pal::Result::Success;
}

// ================================================================================================
// Records and flushes the commands that stop SQTT tracing. Full tracing still continues.
//
// On success the state moves from Running to WaitingForSqtt. Returns ErrorIncompatibleQueue,
// without touching the state, when `gpu` is not the queue that began the trace.
Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
{
  assert(trace_.status_ == TraceStatus::Running);

  // Only allow SQTT trace to start and end on the same queue because it's critical that these
  // are in the same order
  if (trace_.begin_queue_ != gpu) {
    return Pal::Result::ErrorIncompatibleQueue;
  }

  assert(trace_.gpa_session_ != nullptr);

  // Tell the GPA session to insert the commands that end the tracing sample
  gpu->eventBegin(MainEngine);
  trace_.gpa_session_->EndSample(gpu->queue(MainEngine).iCmd(), trace_.gpa_sample_id_);
  gpu->eventEnd(MainEngine, trace_.end_sqtt_event_);

  // Update the global GPU event with a flush
  static constexpr bool NeedFlush = true;
  gpu->setGpuEvent(trace_.end_sqtt_event_, NeedFlush);

  trace_.status_ = TraceStatus::WaitingForSqtt;

  // Optionally wait until the end-SQTT commands have completed
  if (device_.settings().rgpSqttWaitIdle_) {
    // Make sure the trace is done. Note: required for SDMA data write back
    gpu->waitForEvent(&trace_.end_sqtt_event_);
  }

  return Pal::Result::Success;
}

// ================================================================================================
// Ends a running RGP trace by ending the GPA session on the SDMA engine of `gpu`.
//
// On success the state moves from WaitingForSqtt to WaitingForResults. When the GPA session
// fails to end, its error is returned and the state is left unchanged.
Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
{
  assert(trace_.status_ == TraceStatus::WaitingForSqtt);
  assert(trace_.gpa_session_ != nullptr);

  // Tell the GPA session to insert any necessary commands to end the session itself.
  // Initiate SDMA copy; the event bracket is closed regardless of the End() result.
  gpu->eventBegin(SdmaEngine);
  const Pal::Result end_result = trace_.gpa_session_->End(gpu->queue(SdmaEngine).iCmd());
  gpu->eventEnd(SdmaEngine, trace_.end_event_);

  if (end_result != Pal::Result::Success) {
    return end_result;
  }

  // Submit the trace-end commands: update the global GPU event with a flush
  static constexpr bool NeedFlush = true;
  gpu->setGpuEvent(trace_.end_event_, NeedFlush);

  trace_.status_ = TraceStatus::WaitingForResults;

  if (device_.settings().rgpSqttWaitIdle_) {
    // Make sure the transfer is done
    gpu->waitForEvent(&trace_.end_event_);
  }

  return end_result;
}

// ================================================================================================
// Finishes or cancels the current RGP trace and returns the trace state to Idle.
//
// gpu     - queue used to stop a still-counting SQTT capture; may be nullptr
// aborted - true reports the trace to the RGP server as aborted, false as ended normally
//
// Does nothing when no preparing queue is recorded.
void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
{
  if (trace_.prepare_queue_ == nullptr) {
    return;
  }

  // Finish the trace if the queue was destroyed before
  // OCL reached the number of captured dispatches
  const bool capture_in_progress = (gpu != nullptr) && (trace_.sqtt_disp_count_ != 0);
  if (capture_in_progress) {
    EndRGPHardwareTrace(gpu);
  }

  // Inform RGP protocol that we're done with the trace, either by aborting it or finishing
  // normally
  if (!aborted) {
    rgp_server_->EndTrace();
  } else {
    rgp_server_->AbortTrace();
  }

  if (trace_.gpa_session_ != nullptr) {
    trace_.gpa_session_->Reset();
  }

  // Reset tracing state to idle
  trace_.status_              = TraceStatus::Idle;
  trace_.prepare_queue_       = nullptr;
  trace_.begin_queue_         = nullptr;
  trace_.gpa_sample_id_       = 0;
  trace_.prepared_disp_count_ = 0;
  trace_.sqtt_disp_count_     = 0;
}

// ================================================================================================
// Destroys device-persistent RGP resources: aborts a trace that is not idle, frees the scratch
// user-event marker and the GPA session, and zeroes the trace state.
//
// Safe to call more than once. It is reachable from PreDeviceDestroy(), from a failed Init()
// and from the destructor, so every pointer it frees is reset afterwards.
void RgpCaptureMgr::DestroyRGPTracing()
{
  if (trace_.status_ != TraceStatus::Idle) {
    FinishRGPTrace(nullptr, true);
  }

  // Reset the pointer after freeing it; otherwise a second call (for example
  // PreDeviceDestroy() followed by the destructor) would delete the same object twice.
  delete user_event_;
  user_event_ = nullptr;

  // Destroy the GPA session
  if (trace_.gpa_session_ != nullptr) {
    //Util::Destructor(trace_.gpa_session_);
    delete trace_.gpa_session_;
    trace_.gpa_session_ = nullptr;
  }

  memset(&trace_, 0, sizeof(trace_));
}

// ================================================================================================
// Returns true when the gfxip properties report RGP trace support and SQTT has not been
// force-disabled in the settings.
bool RgpCaptureMgr::GpuSupportsTracing(
    const Pal::DeviceProperties& props,
    const Settings&       settings)
{
  // A forced disable overrides whatever the hardware reports
  if (settings.rgpSqttForceDisable_) {
    return false;
  }

  return props.gfxipProperties.flags.supportRgpTraces ? true : false;
}

// ================================================================================================
// Called at the end of a successful Init(). Marks the driver as fully initialized on the
// driver control server if that has not happened yet.
void RgpCaptureMgr::PostDeviceCreate()
{
  amd::ScopedLock traceLock(&trace_mutex_);

  auto* const driver_control = dev_driver_server_->GetDriverControlServer();
  assert(driver_control != nullptr);

  // If the driver hasn't been marked as fully initialized yet, mark it now.
  // We consider the time after the logical device creation to be the fully initialized driver
  // position. This is mainly because PAL is fully initialized at this point and we also know
  // whether or not the debug vmid has been acquired. External tools use this information to
  // decide when it's reasonable to make certain requests of the driver through protocol
  // functions.
  if (!driver_control->IsDriverInitialized()) {
    driver_control->FinishDriverInitialization();
  }
}

// ================================================================================================
// Called prior to a device's being destroyed. Frees the persistent RGP trace resources for that
// device, but only while no trace is in flight.
void RgpCaptureMgr::PreDeviceDestroy()
{
  amd::ScopedLock traceLock(&trace_mutex_);

  // Resources are left alone unless the trace state is idle.
  if (trace_.status_ != TraceStatus::Idle) {
    return;
  }

  DestroyRGPTracing();
}

// ================================================================================================
// Fills in the basic fields of an event marker for `gpu`: identifier, API type, a fresh command
// id and the id of the command buffer on the main engine.
RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
  const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const
{
  RgpSqttMarkerEvent event_marker = {};

  event_marker.identifier = RgpSqttMarkerIdentifierEvent;
  event_marker.apiType    = static_cast<uint32_t>(api_type);
  // Each marker consumes the current event id; the counter advances for the next one
  event_marker.cmdID      = trace_.current_event_id_++;
  event_marker.cbID       = gpu->queue(MainEngine).cmdBufId();

  return event_marker;
}

// ================================================================================================
// Inserts a raw RGP trace marker into the main-engine command buffer of `gpu`.
// `data_size` is given in bytes and must be a non-zero multiple of the dword size.
void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const
{
  // The marker interface takes its payload size in dwords
  const size_t num_dwords = data_size / sizeof(uint32_t);

  assert((data_size % sizeof(uint32_t)) == 0);
  assert(num_dwords > 0);

  gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker(
    static_cast<uint32_t>(num_dwords), data);
}

// ================================================================================================
// Inserts an RGP pre-dispatch marker
void RgpCaptureMgr::WriteEventWithDimsMarker(
  const VirtualGPU*      gpu,
  RgpSqttMarkerEventType apiType,
  uint32_t               x,
  uint32_t               y,
  uint32_t               z) const
{
  assert(apiType != RgpSqttMarkerEventType::Invalid);

  RgpSqttMarkerEventWithDims eventWithDims = {};

  eventWithDims.event = BuildEventMarker(gpu, apiType);
  eventWithDims.event.hasThreadDims = 1;
  eventWithDims.threadX = x;
  eventWithDims.threadY = y;
  eventWithDims.threadZ = z;

  WriteMarker(gpu, &eventWithDims, sizeof(eventWithDims));
}

// ================================================================================================
// Writes a barrier-start marker into the command buffer of `gpu` while a trace is running.
// The marker carries the command buffer id of the queue that began the trace and the barrier
// reason taken from `data`.
void RgpCaptureMgr::WriteBarrierStartMarker(
  const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
{
  // Unlocked fast path: outside of a running trace the lock is never taken
  if (!rgp_server_->TracesEnabled() || (trace_.status_ != TraceStatus::Running)) {
    return;
  }

  amd::ScopedLock traceLock(&trace_mutex_);

  // Re-validate under the lock. The trace may have been finished between the check above and
  // the lock acquisition, which resets begin_queue_ to nullptr; dereferencing it then would
  // crash.
  if ((trace_.status_ != TraceStatus::Running) || (trace_.begin_queue_ == nullptr)) {
    return;
  }

  RgpSqttMarkerBarrierStart marker = {};

  marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
  marker.cbId       = trace_.begin_queue_->queue(MainEngine).cmdBufId();
  marker.dword02    = data.reason;
  marker.internal   = true;

  WriteMarker(gpu, &marker, sizeof(marker));
}

// ================================================================================================
// Writes a barrier-end marker into the command buffer of `gpu` while a trace is running.
// The marker carries the command buffer id of the queue that began the trace plus the pipeline
// stalls and cache operations reported in `data.operations`.
void RgpCaptureMgr::WriteBarrierEndMarker(
  const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
{
  // Unlocked fast path: outside of a running trace the lock is never taken
  if (!rgp_server_->TracesEnabled() || (trace_.status_ != TraceStatus::Running)) {
    return;
  }

  amd::ScopedLock traceLock(&trace_mutex_);

  // Re-validate under the lock. The trace may have been finished between the check above and
  // the lock acquisition, which resets begin_queue_ to nullptr; dereferencing it then would
  // crash.
  if ((trace_.status_ != TraceStatus::Running) || (trace_.begin_queue_ == nullptr)) {
    return;
  }

  // Only the operations of this barrier are reported. The former "|= 0" merges with data from
  // previous markers were no-ops and have been dropped.
  const auto& operations = data.operations;

  RgpSqttMarkerBarrierEnd marker = {};

  marker.identifier           = RgpSqttMarkerIdentifierBarrierEnd;
  marker.cbId                 = trace_.begin_queue_->queue(MainEngine).cmdBufId();

  // Pipeline stalls
  marker.waitOnEopTs          = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
  marker.vsPartialFlush       = operations.pipelineStalls.vsPartialFlush;
  marker.psPartialFlush       = operations.pipelineStalls.psPartialFlush;
  marker.csPartialFlush       = operations.pipelineStalls.csPartialFlush;
  marker.pfpSyncMe            = operations.pipelineStalls.pfpSyncMe;
  marker.syncCpDma            = operations.pipelineStalls.syncCpDma;

  // Cache flushes and invalidations
  marker.invalTcp             = operations.caches.invalTcp;
  marker.invalSqI             = operations.caches.invalSqI$;
  marker.invalSqK             = operations.caches.invalSqK$;
  marker.flushTcc             = operations.caches.flushTcc;
  marker.invalTcc             = operations.caches.invalTcc;
  marker.flushCb              = operations.caches.flushCb;
  marker.invalCb              = operations.caches.invalCb;
  marker.flushDb              = operations.caches.flushDb;
  marker.invalDb              = operations.caches.invalDb;

  marker.numLayoutTransitions = 0;

  WriteMarker(gpu, &marker, sizeof(marker));
}

// ================================================================================================
// Inserts a user event string marker.
//
// gpu       - queue whose main-engine command buffer receives the marker
// eventType - user event type; every type other than Pop carries a string
// name      - string payload, truncated to RgpSqttMaxUserEventStringLengthInDwords dwords
//
// The marker is assembled in the shared user_event_ scratch buffer, so concurrent calls must
// be serialized by the caller.
void RgpCaptureMgr::WriteUserEventMarker(
  const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, const std::string& name) const
{
  memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString));

  user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent;
  user_event_->header.dataType = eventType;

  size_t markerSize = sizeof(user_event_->header);

  if (eventType != RgpSqttMarkerUserEventPop) {
    const size_t strLength =
        std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));

    user_event_->stringLength = static_cast<uint32_t>(strLength);

    // Pack the characters four per dword, lowest byte first
    for (size_t charIdx = 0; charIdx < strLength; ++charIdx) {
      // Go through unsigned char: where plain char is signed, a byte >= 0x80 would otherwise
      // be sign-extended to 0xFFFFFFxx and the OR below would overwrite the neighbouring
      // characters of the same dword.
      const uint32_t c = static_cast<unsigned char>(name[charIdx]);
      user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4)));
    }

    // Every data type other than Pop includes a string length
    markerSize += sizeof(uint32_t);

    // Include the string data (padded up to the nearest dword)
    markerSize += sizeof(uint32_t) * ((strLength + sizeof(uint32_t) - 1) / sizeof(uint32_t));
  }

  WriteMarker(gpu, user_event_, markerSize);
}


}; // namespace pal