From 07cd36bbd0365eb6961706eae35007ce30629438 Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 18 Oct 2018 07:34:23 -0400
Subject: [PATCH] P4 to Git Change 1619397 by gandryey@gera-w8 on 2018/10/15
18:23:10
SWDEV-155434 - Add SQTT instrumentation tokens for OpenCL dispatches for RGP support
- Add the barrier's reason to the RGP trace
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#57 edit
[ROCm/clr commit: 9bad9b4d5495911e0337b3d57aa022dcf0306926]
---
.../clr/rocclr/runtime/device/pal/palgpuopen.cpp | 2 +-
.../clr/rocclr/runtime/device/pal/palgpuopen.hpp | 11 +++++++++++
.../clr/rocclr/runtime/device/pal/palvirtual.cpp | 14 +++++++-------
.../clr/rocclr/runtime/device/pal/palvirtual.hpp | 4 +++-
4 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
index 86980f90ee..99ce6a5cca 100644
--- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
@@ -771,7 +771,7 @@ void RgpCaptureMgr::WriteBarrierStartMarker(
marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
- marker.dword02 = 0xFFFFFFFF; //data.reason;
+ marker.dword02 = data.reason;
marker.internal = true;
WriteMarker(gpu, &marker, sizeof(marker));
diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
index 7fec0063bf..3bcb70679d 100644
--- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
@@ -113,6 +113,17 @@ enum class RgpSqttMarkerEventType : uint32_t
Invalid = 0xffffffff
};
+// ================================================================================================
+enum class RgpSqqtBarrierReason : uint32_t  // Reason tag passed to VirtualGPU::addBarrier() and stored in Pal::BarrierInfo::reason
+{
+ Invalid = 0,
+ MemDependency = 0xC0000000,  // Used by MemoryDependency::validate() and processMemObjectsHSA() cache-flush barriers
+ ProfilingControl = 0xC0000001,  // Used by submitKernelInternal() when profiling is active
+ SignalSubmit = 0xC0000002,  // Used by submitSignal() for the barrier issued before the marker write
+ PostDeviceEnqueue = 0xC0000003,  // Used by PostDeviceEnqueue() for the barrier after the scheduler is run
+ Unknown = 0xffffffff  // Default for addBarrier() when the caller supplies no reason
+};
+
// ================================================================================================
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 0ba6778f8c..22c4f5eb9f 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -443,7 +443,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
if (maxMemObjectsInQueue_ == 0) {
// Flush cache
- gpu.addBarrier();
+ gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
return;
}
@@ -484,7 +484,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
if (flushL1Cache) {
// Flush cache
if (!gpu.profiling()) {
- gpu.addBarrier();
+ gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
}
// Clear memory dependency state
const static bool All = true;
@@ -2036,7 +2036,7 @@ void VirtualGPU::PostDeviceEnqueue(
.runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
- gpuDefQueue->addBarrier(FlushL2);
+ gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);
// Get the address of PM4 template and add write it to params
//! @note DMA flush must not occur between patch and the scheduler
@@ -2272,7 +2272,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// Note: This a workaround for incorrect results reported with release_mem packet,
// when the packet can be processed later after this dispatch and including extra time
if (profiling() || state_.profileEnabled_) {
- addBarrier();
+ addBarrier(RgpSqqtBarrierReason::ProfilingControl);
// Clear memory dependency to avoid the second L1 invalidation
memoryDependency().clear();
}
@@ -2583,7 +2583,7 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
// Make sure GPU finished operation and data reached memory before the marker write
static constexpr bool FlushL2 = true;
- addBarrier(FlushL2);
+ addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
// Workarounds: We had systems where an extra delay was necessary.
{
// Flush CB associated with the DGMA buffer
@@ -3029,7 +3029,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
if (!supportFineGrainedSystem) {
return false;
} else {
- addBarrier();
+ addBarrier(RgpSqqtBarrierReason::MemDependency);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
@@ -3126,7 +3126,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
}
//! This condition is for SVM fine-grain
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
- addBarrier();
+ addBarrier(RgpSqqtBarrierReason::MemDependency);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 4692d4a28e..a7a6b38ee7 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -10,6 +10,7 @@
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palsched.hpp"
#include "device/pal/paldebugger.hpp"
+#include "device/pal/palgpuopen.hpp"
#include "platform/commandqueue.hpp"
#include "device/blit.hpp"
#include "palUtil.h"
@@ -445,7 +446,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
- void addBarrier(bool flushL2 = false) const {
+ void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
@@ -460,6 +461,7 @@ class VirtualGPU : public device::VirtualDevice {
Pal::LayoutShaderRead}};
barrier.pTransitions = &trans;
barrier.waitPoint = Pal::HwPipePreCs;
+ barrier.reason = static_cast<uint32_t>(reason);
iCmd()->CmdBarrier(barrier);
queues_[engineID_]->submit(false);
}