From 07cd36bbd0365eb6961706eae35007ce30629438 Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 18 Oct 2018 07:34:23 -0400 Subject: [PATCH] P4 to Git Change 1619397 by gandryey@gera-w8 on 2018/10/15 18:23:10 SWDEV-155434 - Add SQTT instrumentation tokens for OpenCL dispatches for RGP support - Add the barrier's reason to the RGP trace Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#127 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#57 edit [ROCm/clr commit: 9bad9b4d5495911e0337b3d57aa022dcf0306926] --- .../clr/rocclr/runtime/device/pal/palgpuopen.cpp | 2 +- .../clr/rocclr/runtime/device/pal/palgpuopen.hpp | 11 +++++++++++ .../clr/rocclr/runtime/device/pal/palvirtual.cpp | 14 +++++++------- .../clr/rocclr/runtime/device/pal/palvirtual.hpp | 4 +++- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp index 86980f90ee..99ce6a5cca 100644 --- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp @@ -771,7 +771,7 @@ void RgpCaptureMgr::WriteBarrierStartMarker( marker.identifier = RgpSqttMarkerIdentifierBarrierStart; marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId(); - marker.dword02 = 0xFFFFFFFF; //data.reason; + marker.dword02 = data.reason; marker.internal = true; WriteMarker(gpu, &marker, sizeof(marker)); diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp index 7fec0063bf..3bcb70679d 100644 --- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp @@ -113,6 +113,17 @@ enum class RgpSqttMarkerEventType : uint32_t 
Invalid = 0xffffffff }; +// ================================================================================================ +enum class RgpSqqtBarrierReason : uint32_t +{ + Invalid = 0, + MemDependency = 0xC0000000, + ProfilingControl = 0xC0000001, + SignalSubmit = 0xC0000002, + PostDeviceEnqueue = 0xC0000003, + Unknown = 0xffffffff +}; + // ================================================================================================ // RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. // These are generated ahead of draws or dispatches for commands that trigger generation of waves diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 0ba6778f8c..22c4f5eb9f 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -443,7 +443,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor if (maxMemObjectsInQueue_ == 0) { // Flush cache - gpu.addBarrier(); + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); return; } @@ -484,7 +484,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor if (flushL1Cache) { // Flush cache if (!gpu.profiling()) { - gpu.addBarrier(); + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); } // Clear memory dependency state const static bool All = true; @@ -2036,7 +2036,7 @@ void VirtualGPU::PostDeviceEnqueue( .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0, gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); const static bool FlushL2 = true; - gpuDefQueue->addBarrier(FlushL2); + gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2); // Get the address of PM4 template and add write it to params //! 
@note DMA flush must not occur between patch and the scheduler @@ -2272,7 +2272,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const // Note: This a workaround for incorrect results reported with release_mem packet, // when the packet can be processed later after this dispatch and including extra time if (profiling() || state_.profileEnabled_) { - addBarrier(); + addBarrier(RgpSqqtBarrierReason::ProfilingControl); // Clear memory dependency to avoid the second L1 invalidation memoryDependency().clear(); } @@ -2583,7 +2583,7 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { // Make sure GPU finished operation and data reached memory before the marker write static constexpr bool FlushL2 = true; - addBarrier(FlushL2); + addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2); // Workarounds: We had systems where an extra delay was necessary. { // Flush CB associated with the DGMA buffer @@ -3029,7 +3029,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p if (!supportFineGrainedSystem) { return false; } else { - addBarrier(); + addBarrier(RgpSqqtBarrierReason::MemDependency); // Clear memory dependency state const static bool All = true; memoryDependency().clear(!All); @@ -3126,7 +3126,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p } //! 
This condition is for SVM fine-grain if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) { - addBarrier(); + addBarrier(RgpSqqtBarrierReason::MemDependency); // Clear memory dependency state const static bool All = true; memoryDependency().clear(!All); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index 4692d4a28e..a7a6b38ee7 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -10,6 +10,7 @@ #include "device/pal/paltimestamp.hpp" #include "device/pal/palsched.hpp" #include "device/pal/paldebugger.hpp" +#include "device/pal/palgpuopen.hpp" #include "platform/commandqueue.hpp" #include "device/blit.hpp" #include "palUtil.h" @@ -445,7 +446,7 @@ class VirtualGPU : public device::VirtualDevice { //! Returns queue, associated with VirtualGPU Queue& queue(EngineType id) const { return *queues_[id]; } - void addBarrier(bool flushL2 = false) const { + void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const { Pal::BarrierInfo barrier = {}; barrier.pipePointWaitCount = 1; Pal::HwPipePoint point = Pal::HwPipePostCs; @@ -460,6 +461,7 @@ class VirtualGPU : public device::VirtualDevice { Pal::LayoutShaderRead}}; barrier.pTransitions = &trans; barrier.waitPoint = Pal::HwPipePreCs; + barrier.reason = static_cast<uint32_t>(reason); iCmd()->CmdBarrier(barrier); queues_[engineID_]->submit(false); }