From 705bd72c1a23dd7eeb18658587dd39e2d318ac9d Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 12 Jul 2018 12:40:04 -0400
Subject: [PATCH] P4 to Git Change 1579667 by gandryey@gera-w8 on 2018/07/12
12:31:33
SWDEV-155438 - Produce RGP Queue Timings chunk for OpenCL RGP files
- Collect command buffer timing
- Capture 50 dispatches by default
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#113 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#54 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#291 edit
[ROCm/clr commit: 96d0ddb7283cfba50024a437fa570ee06e4e62bb]
---
.../rocclr/runtime/device/pal/palgpuopen.cpp | 62 ++++++++++++++++---
.../rocclr/runtime/device/pal/palgpuopen.hpp | 7 ++-
.../rocclr/runtime/device/pal/palvirtual.cpp | 23 +++++--
.../rocclr/runtime/device/pal/palvirtual.hpp | 16 ++---
projects/clr/rocclr/runtime/utils/flags.hpp | 2 +-
5 files changed, 85 insertions(+), 25 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
index 081f990dcb..0e1c21fdc6 100644
--- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp
@@ -122,20 +122,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
result = false;
}
- // Initialize trace resources required by each queue (and queue family)
- bool hasDebugVmid = true;
-
if (result) {
user_event_ = new RgpSqttMarkerUserEventWithString;
if (nullptr == user_event_) {
result = false;
}
- //result = InitTraceQueueResources(trace_, &hasDebugVmid);
- }
-
- // If we've failed to acquire the debug VMID, fail to trace
- if (hasDebugVmid == false) {
- result = false;
}
if (!result) {
@@ -155,6 +146,59 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
return result;
}
+// ================================================================================================
+// Registers the MainEngine PAL queue of the given virtual GPU with the GPA session for timed queue
+// operations. Writes the hasDebugVmid flag to *debug_vmid; returns false if registration fails.
+bool RgpCaptureMgr::RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const
+{
+ bool result = true;
+
+ // Query the OS kernel context info for this queue; its contextIdentifier is passed to the GPA
+ // session below. The struct is zero-initialized, so untouched fields read as 0.
+ Pal::KernelContextInfo kernelContextInfo = {};
+
+ Pal::Result palResult = gpu->queue(MainEngine).iQueue_->QueryKernelContextInfo(&kernelContextInfo);
+
+ // Report whether the debug VMID was acquired. palResult is deliberately not checked: some
+ // platforms do not implement this query, so don't fail the whole trace if so.
+ *debug_vmid = kernelContextInfo.flags.hasDebugVmid;
+
+ // Register the queue with the GPA session class for timed queue operation support.
+ if (trace_.gpa_session_->RegisterTimedQueue(gpu->queue(MainEngine).iQueue_, gpu->index(),
+ kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
+ result = false;
+ }
+
+ return result;
+}
+
+// ================================================================================================
+Pal::Result RgpCaptureMgr::TimedQueueSubmit(
+ Pal::IQueue* queue,
+ uint64_t cmdId,
+ const Pal::SubmitInfo& submitInfo) const
+{
+ // Build the timed-submit meta-data that associates this submission with the generated timing
+ // information: cmdId is the API command buffer id; the SQTT id and frame index are left at 0.
+ GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
+ Pal::uint64 apiCmdBufIds = cmdId;
+ Pal::uint32 sqttCmdBufIds = 0;
+
+ timedSubmitInfo.pApiCmdBufIds = &apiCmdBufIds;
+ timedSubmitInfo.pSqttCmdBufIds = &sqttCmdBufIds;
+ timedSubmitInfo.frameIndex = 0;
+
+ // Do a timed submit of all the command buffers through the GPA session
+ Pal::Result result = trace_.gpa_session_->TimedSubmit(queue, submitInfo, timedSubmitInfo);
+
+ // Fall back to a regular (non-timed) submit if the timed one fails (or is not supported);
+ // in that case the fallback's result is what gets returned.
+ if (result != Pal::Result::Success) {
+ result = queue->Submit(submitInfo);
+ }
+
+ return result;
+}
+
// ================================================================================================
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
//
diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
index 42ba2f0a22..cf1f0efec6 100644
--- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp
@@ -299,8 +299,8 @@ public:
void Finalize();
- void PreDispatch(VirtualGPU* pQueue, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
- void PostDispatch(VirtualGPU* pQueue);
+ void PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
+ void PostDispatch(VirtualGPU* gpu);
void WaitForDriverResume();
@@ -312,6 +312,9 @@ public:
void WriteBarrierStartMarker(const Pal::Developer::BarrierData& data) const;
void WriteBarrierEndMarker(const Pal::Developer::BarrierData& data) const;
+ bool RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const;
+ Pal::Result TimedQueueSubmit(
+ Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;
private:
// Steps that an RGP trace goes through
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 7192bb74c3..9d35f0485f 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -33,10 +33,11 @@
namespace pal {
-VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
+VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType,
uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
uint rtCU, amd::CommandQueue::Priority priority,
uint64_t residency_limit, uint max_command_buffers) {
+ Pal::IDevice* palDev = gpu.dev().iDev();
Pal::Result result;
Pal::CmdBufferCreateInfo cmdCreateInfo = {};
Pal::QueueCreateInfo qCreateInfo = {};
@@ -82,7 +83,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
}
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
- VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
+ VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
+ residency_limit, max_command_buffers);
if (queue != nullptr) {
address addrQ = reinterpret_cast(&queue[1]);
// Create PAL queue object
@@ -250,7 +252,14 @@ bool VirtualGPU::Queue::flush() {
submitInfo.ppExternPhysMem = palSdiRefs_.data();
// Submit command buffer to OS
- if (Pal::Result::Success != iQueue_->Submit(submitInfo)) {
+ Pal::Result result;
+ if (gpu_.rgpCaptureEna()) {
+ result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
+ iQueue_, cmdBufIdCurrent_, submitInfo);
+ } else {
+ result = iQueue_->Submit(submitInfo);
+ }
+ if (Pal::Result::Success != result) {
LogError("PAL failed to submit CMD!");
return false;
}
@@ -787,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
- queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
+ queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
cmdAllocator_, rtCUs, priority,
residency_limit, max_cmd_buffers);
if (nullptr == queues_[MainEngine]) {
@@ -805,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
}
queues_[SdmaEngine] =
- Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
+ Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
} else {
- queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
+ queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
@@ -890,7 +899,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// If the developer mode manager is available and it's not a device queue,
// then enable RGP capturing
if ((index() != 0) && dev().rgpCaptureMgr() != nullptr) {
+ bool dbg_vmid = false;
state_.rgpCaptureEnabled_ = true;
+ dev().rgpCaptureMgr()->RegisterTimedQueue(this, &dbg_vmid);
}
return true;
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index fccee6d60e..221b98b36e 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -50,7 +50,7 @@ class VirtualGPU : public device::VirtualDevice {
Queue(const Queue&) = delete;
Queue& operator=(const Queue&) = delete;
- static Queue* Create(Pal::IDevice* palDev, //!< PAL device object
+ static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
Pal::QueueType queueType, //!< PAL queue type
uint engineIdx, //!< Select particular engine index
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
@@ -60,12 +60,13 @@ class VirtualGPU : public device::VirtualDevice {
uint max_command_buffers //!< Number of allocated command buffers
);
- Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers)
+ Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
: iQueue_(nullptr),
iCmdBuffs_(max_command_buffers, nullptr),
iCmdFences_(max_command_buffers, nullptr),
last_kernel_(nullptr),
- iDev_(palDev),
+ gpu_(gpu),
+ iDev_(iDev),
cmdBufIdSlot_(StartCmdBufIdx),
cmdBufIdCurrent_(StartCmdBufIdx),
cmbBufIdRetired_(0),
@@ -156,16 +157,17 @@ class VirtualGPU : public device::VirtualDevice {
private:
void DumpMemoryReferences() const;
+ const VirtualGPU& gpu_; //!< OCL virtual GPU object
Pal::IDevice* iDev_; //!< PAL device
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
uint cmdBufIdCurrent_; //!< Current global command buffer ID
uint cmbBufIdRetired_; //!< The last retired command buffer ID
uint cmdCnt_; //!< Counter of commands
std::unordered_map memReferences_;
- Util::VirtualLinearAllocator vlAlloc_;
- std::vector palMemRefs_;
- std::vector palMems_;
- std::vector palDoppRefs_;
+ Util::VirtualLinearAllocator vlAlloc_;
+ std::vector palMemRefs_;
+ std::vector palMems_;
+ std::vector palDoppRefs_;
std::set sdiReferences_;
std::vector palSdiRefs_;
uint64_t residency_size_; //!< Resource residency size
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index 9c3afe475f..9ddaf715e6 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -197,7 +197,7 @@ release(bool, GPU_VEGA10_ONLY, VEGA10_ONLY, \
"1 = Report vega10 only on OCL/ROCR") \
release_on_stg(bool, PAL_DISABLE_SDMA, false, \
"1 = Disable SDMA for PAL") \
-release(uint, PAL_RGP_DISP_COUNT, 10, \
+release(uint, PAL_RGP_DISP_COUNT, 50, \
"The number of dispatches for RGP capture with SQTT") \
release(bool, GPU_FORCE_WAVE_SIZE_32, false, \
"Forces WaveSize32 compilation in SC") \