P4 to Git Change 1619397 by gandryey@gera-w8 on 2018/10/15 18:23:10
SWDEV-155434 - Add SQTT instrumentation tokens for OpenCL dispatches for RGP support
- Add the barrier's reason to the RGP trace
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#57 edit
[ROCm/clr commit: 9bad9b4d54]
Tento commit je obsažen v:
@@ -771,7 +771,7 @@ void RgpCaptureMgr::WriteBarrierStartMarker(
|
||||
|
||||
marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
|
||||
marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
|
||||
marker.dword02 = 0xFFFFFFFF; //data.reason;
|
||||
marker.dword02 = data.reason;
|
||||
marker.internal = true;
|
||||
|
||||
WriteMarker(gpu, &marker, sizeof(marker));
|
||||
|
||||
@@ -113,6 +113,17 @@ enum class RgpSqttMarkerEventType : uint32_t
|
||||
Invalid = 0xffffffff
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
// Reason codes that tag a barrier for the RGP trace. VirtualGPU::addBarrier() casts the
// value to uint32_t and stores it in the barrier's `reason` field; WriteBarrierStartMarker
// writes `data.reason` into the barrier-start marker's dword02 (presumably the same value
// forwarded by the driver layer - confirm against the callback path).
enum class RgpSqqtBarrierReason : uint32_t
|
||||
{
|
||||
Invalid = 0,  // not passed by any call site visible in this change
|
||||
MemDependency = 0xC0000000,  // passed by MemoryDependency::validate and processMemObjectsHSA
|
||||
ProfilingControl = 0xC0000001,  // passed by submitKernelInternal when profiling is enabled
|
||||
SignalSubmit = 0xC0000002,  // passed by submitSignal ahead of the marker write
|
||||
PostDeviceEnqueue = 0xC0000003,  // passed by VirtualGPU::PostDeviceEnqueue after runScheduler
|
||||
Unknown = 0xffffffff  // default `reason` argument of VirtualGPU::addBarrier
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
|
||||
// These are generated ahead of draws or dispatches for commands that trigger generation of waves
|
||||
|
||||
@@ -443,7 +443,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
|
||||
|
||||
if (maxMemObjectsInQueue_ == 0) {
|
||||
// Flush cache
|
||||
gpu.addBarrier();
|
||||
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -484,7 +484,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
|
||||
if (flushL1Cache) {
|
||||
// Flush cache
|
||||
if (!gpu.profiling()) {
|
||||
gpu.addBarrier();
|
||||
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
|
||||
}
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
@@ -2036,7 +2036,7 @@ void VirtualGPU::PostDeviceEnqueue(
|
||||
.runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
|
||||
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
|
||||
const static bool FlushL2 = true;
|
||||
gpuDefQueue->addBarrier(FlushL2);
|
||||
gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);
|
||||
|
||||
// Get the address of the PM4 template and write it to params
|
||||
//! @note DMA flush must not occur between patch and the scheduler
|
||||
@@ -2272,7 +2272,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
// Note: This is a workaround for incorrect results reported with release_mem packet,
|
||||
// when the packet can be processed later after this dispatch and including extra time
|
||||
if (profiling() || state_.profileEnabled_) {
|
||||
addBarrier();
|
||||
addBarrier(RgpSqqtBarrierReason::ProfilingControl);
|
||||
// Clear memory dependency to avoid the second L1 invalidation
|
||||
memoryDependency().clear();
|
||||
}
|
||||
@@ -2583,7 +2583,7 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
|
||||
|
||||
// Make sure GPU finished operation and data reached memory before the marker write
|
||||
static constexpr bool FlushL2 = true;
|
||||
addBarrier(FlushL2);
|
||||
addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
|
||||
// Workarounds: We had systems where an extra delay was necessary.
|
||||
{
|
||||
// Flush CB associated with the DGMA buffer
|
||||
@@ -3029,7 +3029,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
if (!supportFineGrainedSystem) {
|
||||
return false;
|
||||
} else {
|
||||
addBarrier();
|
||||
addBarrier(RgpSqqtBarrierReason::MemDependency);
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
@@ -3126,7 +3126,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
}
|
||||
//! This condition is for SVM fine-grain
|
||||
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
|
||||
addBarrier();
|
||||
addBarrier(RgpSqqtBarrierReason::MemDependency);
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "device/pal/paltimestamp.hpp"
|
||||
#include "device/pal/palsched.hpp"
|
||||
#include "device/pal/paldebugger.hpp"
|
||||
#include "device/pal/palgpuopen.hpp"
|
||||
#include "platform/commandqueue.hpp"
|
||||
#include "device/blit.hpp"
|
||||
#include "palUtil.h"
|
||||
@@ -445,7 +446,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Returns queue, associated with VirtualGPU
|
||||
Queue& queue(EngineType id) const { return *queues_[id]; }
|
||||
|
||||
void addBarrier(bool flushL2 = false) const {
|
||||
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
|
||||
Pal::BarrierInfo barrier = {};
|
||||
barrier.pipePointWaitCount = 1;
|
||||
Pal::HwPipePoint point = Pal::HwPipePostCs;
|
||||
@@ -460,6 +461,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
Pal::LayoutShaderRead}};
|
||||
barrier.pTransitions = &trans;
|
||||
barrier.waitPoint = Pal::HwPipePreCs;
|
||||
barrier.reason = static_cast<uint32_t>(reason);
|
||||
iCmd()->CmdBarrier(barrier);
|
||||
queues_[engineID_]->submit<true>(false);
|
||||
}
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele