P4 to Git Change 1619397 by gandryey@gera-w8 on 2018/10/15 18:23:10

SWDEV-155434 - Add SQTT instrumentation tokens for OpenCL dispatches for RGP support
	- Add the barrier's reason to the RGP trace

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#57 edit


[ROCm/clr commit: 9bad9b4d54]
Tento commit je obsažen v:
foreman
2018-10-18 07:34:23 -04:00
rodič 1f6a902aaa
revize 07cd36bbd0
4 změnil soubory, kde provedl 22 přidání a 9 odebrání
+1 -1
Zobrazit soubor
@@ -771,7 +771,7 @@ void RgpCaptureMgr::WriteBarrierStartMarker(
marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
marker.dword02 = 0xFFFFFFFF; //data.reason;
marker.dword02 = data.reason;
marker.internal = true;
WriteMarker(gpu, &marker, sizeof(marker));
+11
Zobrazit soubor
@@ -113,6 +113,17 @@ enum class RgpSqttMarkerEventType : uint32_t
Invalid = 0xffffffff
};
// ================================================================================================
// Reason code attached to a barrier issued by the OpenCL runtime. VirtualGPU::addBarrier()
// casts the value to uint32_t and stores it in the PAL barrier info's `reason` field; the
// barrier-start marker then carries the reason in its dword02 for the RGP trace.
enum class RgpSqqtBarrierReason : uint32_t
{
    // Zero value; not used by any barrier call site.
    Invalid           = 0x00000000u,
    // Barrier added while resolving memory dependencies (cache flush).
    MemDependency     = 0xC0000000u,
    // Barrier added before a dispatch when profiling is active.
    ProfilingControl  = 0xC0000001u,
    // Barrier added when submitting a signal command, ahead of the marker write.
    SignalSubmit      = 0xC0000002u,
    // Barrier added on the device default queue after the scheduler is launched.
    PostDeviceEnqueue = 0xC0000003u,
    // Default reason used when the caller does not supply one.
    Unknown           = 0xFFFFFFFFu
};
// ================================================================================================
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
+7 -7
Zobrazit soubor
@@ -443,7 +443,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
if (maxMemObjectsInQueue_ == 0) {
// Flush cache
gpu.addBarrier();
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
return;
}
@@ -484,7 +484,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
if (flushL1Cache) {
// Flush cache
if (!gpu.profiling()) {
gpu.addBarrier();
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
}
// Clear memory dependency state
const static bool All = true;
@@ -2036,7 +2036,7 @@ void VirtualGPU::PostDeviceEnqueue(
.runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
gpuDefQueue->addBarrier(FlushL2);
gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);
// Get the address of the PM4 template and write it to params
//! @note DMA flush must not occur between patch and the scheduler
@@ -2272,7 +2272,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
// Note: This is a workaround for incorrect results reported with release_mem packet,
// when the packet can be processed later after this dispatch and including extra time
if (profiling() || state_.profileEnabled_) {
addBarrier();
addBarrier(RgpSqqtBarrierReason::ProfilingControl);
// Clear memory dependency to avoid the second L1 invalidation
memoryDependency().clear();
}
@@ -2583,7 +2583,7 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
// Make sure GPU finished operation and data reached memory before the marker write
static constexpr bool FlushL2 = true;
addBarrier(FlushL2);
addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
// Workarounds: We had systems where an extra delay was necessary.
{
// Flush CB associated with the DGMA buffer
@@ -3029,7 +3029,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
if (!supportFineGrainedSystem) {
return false;
} else {
addBarrier();
addBarrier(RgpSqqtBarrierReason::MemDependency);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
@@ -3126,7 +3126,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
}
//! This condition is for SVM fine-grain
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
addBarrier();
addBarrier(RgpSqqtBarrierReason::MemDependency);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
+3 -1
Zobrazit soubor
@@ -10,6 +10,7 @@
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palsched.hpp"
#include "device/pal/paldebugger.hpp"
#include "device/pal/palgpuopen.hpp"
#include "platform/commandqueue.hpp"
#include "device/blit.hpp"
#include "palUtil.h"
@@ -445,7 +446,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
void addBarrier(bool flushL2 = false) const {
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
@@ -460,6 +461,7 @@ class VirtualGPU : public device::VirtualDevice {
Pal::LayoutShaderRead}};
barrier.pTransitions = &trans;
barrier.waitPoint = Pal::HwPipePreCs;
barrier.reason = static_cast<uint32_t>(reason);
iCmd()->CmdBarrier(barrier);
queues_[engineID_]->submit<true>(false);
}