P4 to Git Change 1579667 by gandryey@gera-w8 on 2018/07/12 12:31:33

SWDEV-155438 - Produce RGP Queue Timings chunk for OpenCL RGP files
- Collect command buffer timing
- Capture 50 dispatches by default

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#113 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#54 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#291 edit

[ROCm/clr commit: 96d0ddb728]
2018-07-12 12:40:04 -04:00
Commit 705bd72c1a
@@ -122,20 +122,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
    result = false;
  }

-  // Initialize trace resources required by each queue (and queue family)
-  bool hasDebugVmid = true;
-
  if (result) {
    user_event_ = new RgpSqttMarkerUserEventWithString;
    if (nullptr == user_event_) {
      result = false;
    }
-    //result = InitTraceQueueResources(trace_, &hasDebugVmid);
-  }
-
-  // If we've failed to acquire the debug VMID, fail to trace
-  if (hasDebugVmid == false) {
-    result = false;
  }

  if (!result) {
@@ -155,6 +146,59 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
  return result;
 }

+// ================================================================================================
+// Registers the main-engine queue of the given virtual GPU with the GPA session, so that its
+// submissions can be timed. Reports through *debug_vmid whether the debug VMID is available (or
+// can't be queried on this platform); returns false if the GPA session rejects the queue.
+bool RgpCaptureMgr::RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const
+{
+  Pal::IQueue* queue = gpu->queue(MainEngine).iQueue_;
+
+  // Query the OS context of this queue; its context identifier is handed to the GPA session
+  // below. The structure is zero-initialized up front, so the fields read afterwards are
+  // well defined even when the query doesn't fill them in.
+  Pal::KernelContextInfo kernelContextInfo = {};
+  const Pal::Result palResult = queue->QueryKernelContextInfo(&kernelContextInfo);
+
+  // Some platforms do not implement the query. Treat "unsupported" like an acquired debug VMID,
+  // so that a caller checking the flag doesn't fail the whole trace on those platforms.
+  *debug_vmid =
+      kernelContextInfo.flags.hasDebugVmid || (palResult == Pal::Result::Unsupported);
+
+  // Register the queue with the GPA session class for timed queue operation support.
+  const Pal::Result regResult = trace_.gpa_session_->RegisterTimedQueue(
+      queue, gpu->index(), kernelContextInfo.contextIdentifier);
+
+  return (regResult == Pal::Result::Success);
+}
+
+// ================================================================================================
+// Submits the command buffers through the GPA session so the submission gets timed, tagging it
+// with the API command buffer ID. Falls back to a regular queue submit if the timed one fails.
+Pal::Result RgpCaptureMgr::TimedQueueSubmit(
+  Pal::IQueue*  queue,
+  uint64_t      cmdId,
+  const Pal::SubmitInfo& submitInfo) const
+{
+  // Local copies of the IDs; the timed submit info below only stores pointers to them
+  Pal::uint64 apiId = cmdId;
+  Pal::uint32 sqttId = 0;
+
+  // Meta-data that associates the API command buffer with the generated timing information
+  GpuUtil::TimedSubmitInfo info = {};
+  info.pApiCmdBufIds = &apiId;
+  info.pSqttCmdBufIds = &sqttId;
+  info.frameIndex = 0;
+
+  const Pal::Result timed = trace_.gpa_session_->TimedSubmit(queue, submitInfo, info);
+  if (timed == Pal::Result::Success) {
+    return timed;
+  }
+
+  // The timed submit failed (or is not supported): punt to the plain, non-timed submit
+  return queue->Submit(submitInfo);
+}
+
 // ================================================================================================
 // Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
 //
@@ -299,8 +299,8 @@ public:

  void Finalize();

-  void PreDispatch(VirtualGPU* pQueue, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
-  void PostDispatch(VirtualGPU* pQueue);
+  void PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
+  void PostDispatch(VirtualGPU* gpu);

  void WaitForDriverResume();

@@ -312,6 +312,9 @@ public:

  void WriteBarrierStartMarker(const Pal::Developer::BarrierData& data) const;
  void WriteBarrierEndMarker(const Pal::Developer::BarrierData& data) const;
+  bool RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const;
+  Pal::Result TimedQueueSubmit(
+    Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;

 private:
  // Steps that an RGP trace goes through
@@ -33,10 +33,11 @@

 namespace pal {

-VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
+VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType,
                                             uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
                                             uint rtCU, amd::CommandQueue::Priority priority,
                                             uint64_t residency_limit, uint max_command_buffers) {
+  Pal::IDevice* palDev = gpu.dev().iDev();
  Pal::Result result;
  Pal::CmdBufferCreateInfo cmdCreateInfo = {};
  Pal::QueueCreateInfo qCreateInfo = {};
@@ -82,7 +83,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
  }

  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
-  VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
+  VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
+    residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = reinterpret_cast<address>(&queue[1]);
    // Create PAL queue object
@@ -250,7 +252,14 @@ bool VirtualGPU::Queue::flush() {
  submitInfo.ppExternPhysMem = palSdiRefs_.data();

  // Submit command buffer to OS
-  if (Pal::Result::Success != iQueue_->Submit(submitInfo)) {
+  Pal::Result result;
+  if (gpu_.rgpCaptureEna()) {
+    result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
+      iQueue_, cmdBufIdCurrent_, submitInfo);
+  } else {
+    result = iQueue_->Submit(submitInfo);
+  }
+  if (Pal::Result::Success != result) {
    LogError("PAL failed to submit CMD!");
    return false;
  }
@@ -787,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
    // hwRing_ should be set 0 if forced to have single scratch buffer
    hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;

-    queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
+    queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
                                        cmdAllocator_, rtCUs, priority,
                                        residency_limit, max_cmd_buffers);
    if (nullptr == queues_[MainEngine]) {
@@ -805,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
      }

      queues_[SdmaEngine] =
-          Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
+          Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
                        amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
                        residency_limit, max_cmd_buffers);
      if (nullptr == queues_[SdmaEngine]) {
        return false;
      }
    } else {
-        queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
+        queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
            idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
            residency_limit, max_cmd_buffers);
        if (nullptr == queues_[SdmaEngine]) {
@@ -890,7 +899,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
  // If the developer mode manager is available and it's not a device queue,
  // then enable RGP capturing
  if ((index() != 0) && dev().rgpCaptureMgr() != nullptr) {
+    bool dbg_vmid = false;
    state_.rgpCaptureEnabled_ = true;
+    dev().rgpCaptureMgr()->RegisterTimedQueue(this, &dbg_vmid);
  }

  return true;
@@ -50,7 +50,7 @@ class VirtualGPU : public device::VirtualDevice {
    Queue(const Queue&) = delete;
    Queue& operator=(const Queue&) = delete;

-    static Queue* Create(Pal::IDevice* palDev,                 //!< PAL device object
+    static Queue* Create(const VirtualGPU& gpu,                //!< OCL virtual GPU object
                         Pal::QueueType queueType,             //!< PAL queue type
                         uint engineIdx,                       //!< Select particular engine index
                         Pal::ICmdAllocator* cmdAlloc,         //!< PAL CMD buffer allocator
@@ -60,12 +60,13 @@ class VirtualGPU : public device::VirtualDevice {
                         uint max_command_buffers              //!< Number of allocated command buffers
                         );

-    Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers)
+    Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
        : iQueue_(nullptr),
          iCmdBuffs_(max_command_buffers, nullptr),
          iCmdFences_(max_command_buffers, nullptr),
          last_kernel_(nullptr),
-          iDev_(palDev),
+          gpu_(gpu),
+          iDev_(iDev),
          cmdBufIdSlot_(StartCmdBufIdx),
          cmdBufIdCurrent_(StartCmdBufIdx),
          cmbBufIdRetired_(0),
@@ -156,16 +157,17 @@ class VirtualGPU : public device::VirtualDevice {

  private:
    void DumpMemoryReferences() const;
+    const VirtualGPU& gpu_; //!< OCL virtual GPU object
    Pal::IDevice* iDev_;    //!< PAL device
    uint cmdBufIdSlot_;     //!< Command buffer ID slot for submissions
    uint cmdBufIdCurrent_;  //!< Current global command buffer ID
    uint cmbBufIdRetired_;  //!< The last retired command buffer ID
    uint cmdCnt_;           //!< Counter of commands
    std::unordered_map<GpuMemoryReference*, uint> memReferences_;
-    Util::VirtualLinearAllocator vlAlloc_;
-    std::vector<Pal::GpuMemoryRef> palMemRefs_;
-    std::vector<Pal::IGpuMemory*> palMems_;
-    std::vector<Pal::DoppRef> palDoppRefs_;
+    Util::VirtualLinearAllocator    vlAlloc_;
+    std::vector<Pal::GpuMemoryRef>  palMemRefs_;
+    std::vector<Pal::IGpuMemory*>   palMems_;
+    std::vector<Pal::DoppRef>       palDoppRefs_;
    std::set<Pal::IGpuMemory*>      sdiReferences_;
    std::vector<const Pal::IGpuMemory*>   palSdiRefs_;
    uint64_t  residency_size_;  //!< Resource residency size
@@ -197,7 +197,7 @@ release(bool, GPU_VEGA10_ONLY, VEGA10_ONLY,                                   \
        "1 = Report vega10 only on OCL/ROCR")                                 \
 release_on_stg(bool, PAL_DISABLE_SDMA, false,                                 \
        "1 = Disable SDMA for PAL")                                           \
-release(uint, PAL_RGP_DISP_COUNT, 10,                                         \
+release(uint, PAL_RGP_DISP_COUNT, 50,                                         \
        "The number of dispatches for RGP capture with SQTT")                 \
 release(bool, GPU_FORCE_WAVE_SIZE_32, false,                                  \
        "Forces WaveSize32 compilation in SC")                                \