P4 to Git Change 1619397 by gandryey@gera-w8 on 2018/10/15 18:23:10

SWDEV-155434 - Add SQTT instrumentation tokens for OpenCL dispatches for RGP support
  - Add the barrier's reason to the RGP trace

Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#57 edit

[ROCm/clr commit: 9bad9b4d54]
2018-10-18 07:34:23 -04:00
@@ -771,7 +771,7 @@ void RgpCaptureMgr::WriteBarrierStartMarker(

    marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
    marker.cbId       = trace_.begin_queue_->queue(MainEngine).cmdBufId();
-    marker.dword02    = 0xFFFFFFFF; //data.reason;
+    marker.dword02    = data.reason;
    marker.internal   = true;

    WriteMarker(gpu, &marker, sizeof(marker));
@@ -113,6 +113,17 @@ enum class RgpSqttMarkerEventType : uint32_t
  Invalid = 0xffffffff
 };

+// ================================================================================================
+enum class RgpSqqtBarrierReason : uint32_t
+{
+  Invalid = 0,
+  MemDependency = 0xC0000000,
+  ProfilingControl = 0xC0000001,
+  SignalSubmit = 0xC0000002,
+  PostDeviceEnqueue = 0xC0000003,
+  Unknown = 0xffffffff
+};
+
 // ================================================================================================
 // RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.  
 // These are generated ahead of draws or dispatches for commands that trigger generation of waves
@@ -443,7 +443,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor

  if (maxMemObjectsInQueue_ == 0) {
    // Flush cache
-    gpu.addBarrier();
+    gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
    return;
  }

@@ -484,7 +484,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
  if (flushL1Cache) {
    // Flush cache
    if (!gpu.profiling()) {
-      gpu.addBarrier();
+      gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
    }
    // Clear memory dependency state
    const static bool All = true;
@@ -2036,7 +2036,7 @@ void VirtualGPU::PostDeviceEnqueue(
    .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
      gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  const static bool FlushL2 = true;
-  gpuDefQueue->addBarrier(FlushL2);
+  gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);

  // Get the address of PM4 template and add write it to params
  //! @note DMA flush must not occur between patch and the scheduler
@@ -2272,7 +2272,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    // Note: This a workaround for incorrect results reported with release_mem packet,
    // when the packet can be processed later after this dispatch and including extra time
    if (profiling() || state_.profileEnabled_) {
-      addBarrier();
+      addBarrier(RgpSqqtBarrierReason::ProfilingControl);
      // Clear memory dependency to avoid the second L1 invalidation
      memoryDependency().clear();
    }
@@ -2583,7 +2583,7 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {

    // Make sure GPU finished operation and data reached memory before the marker write
    static constexpr bool FlushL2 = true;
-    addBarrier(FlushL2);
+    addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
    // Workarounds: We had systems where an extra delay was necessary.
    {
        // Flush CB associated with the DGMA buffer
@@ -3029,7 +3029,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
        if (!supportFineGrainedSystem) {
          return false;
        } else {
-          addBarrier();
+          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
@@ -3126,7 +3126,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
        }
        //! This condition is for SVM fine-grain
        if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
-          addBarrier();
+          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
@@ -10,6 +10,7 @@
 #include "device/pal/paltimestamp.hpp"
 #include "device/pal/palsched.hpp"
 #include "device/pal/paldebugger.hpp"
+#include "device/pal/palgpuopen.hpp"
 #include "platform/commandqueue.hpp"
 #include "device/blit.hpp"
 #include "palUtil.h"
@@ -445,7 +446,7 @@ class VirtualGPU : public device::VirtualDevice {
  //! Returns queue, associated with VirtualGPU
  Queue& queue(EngineType id) const { return *queues_[id]; }

-  void addBarrier(bool flushL2 = false) const {
+  void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
    Pal::BarrierInfo barrier = {};
    barrier.pipePointWaitCount = 1;
    Pal::HwPipePoint point = Pal::HwPipePostCs;
@@ -460,6 +461,7 @@ class VirtualGPU : public device::VirtualDevice {
                                     Pal::LayoutShaderRead}};
    barrier.pTransitions = &trans;
    barrier.waitPoint = Pal::HwPipePreCs;
+    barrier.reason = static_cast<uint32_t>(reason);
    iCmd()->CmdBarrier(barrier);
    queues_[engineID_]->submit<true>(false);
  }