P4 to Git Change 1579667 by gandryey@gera-w8 on 2018/07/12 12:31:33

SWDEV-155438 - Produce RGP Queue Timings chunk for OpenCL RGP files
	- Collect command buffer timing
	- Capture 50 dispatches by default

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#113 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#54 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#291 edit
This commit is contained in:
foreman
2018-07-12 12:40:04 -04:00
parent a7cc26942f
commit 96d0ddb728
5 changed files with 85 additions and 25 deletions
+53 -9
View File
@@ -122,20 +122,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
result = false;
}
// Initialize trace resources required by each queue (and queue family)
bool hasDebugVmid = true;
if (result) {
user_event_ = new RgpSqttMarkerUserEventWithString;
if (nullptr == user_event_) {
result = false;
}
//result = InitTraceQueueResources(trace_, &hasDebugVmid);
}
// If we've failed to acquire the debug VMID, fail to trace
if (hasDebugVmid == false) {
result = false;
}
if (!result) {
@@ -155,6 +146,59 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
return result;
}
// ================================================================================================
// Registers the main-engine PAL queue of a single virtual GPU with the GPA session so that
// its submissions can be timed.
//
// gpu        - virtual GPU whose MainEngine queue is registered.
// debug_vmid - [out] receives the hasDebugVmid flag from the queue's kernel context info.
//
// Returns true when the GPA session accepted the queue, false otherwise.
bool RgpCaptureMgr::RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const
{
  Pal::IQueue* const mainQueue = gpu->queue(MainEngine).iQueue_;

  // Get the OS context handle for this queue. The result of the query is intentionally
  // discarded: some platforms do not implement this function, and that must not fail the
  // whole trace. The info structure is zero-initialized before the call, so the fields read
  // below start out as zero.
  Pal::KernelContextInfo kernelContextInfo = {};
  static_cast<void>(mainQueue->QueryKernelContextInfo(&kernelContextInfo));

  // Report whether the debug VMID has been acquired; the caller decides what to do with it.
  *debug_vmid = kernelContextInfo.flags.hasDebugVmid;

  // Register the queue with the GPA session class for timed queue operation support.
  return trace_.gpa_session_->RegisterTimedQueue(mainQueue, gpu->index(),
                                                 kernelContextInfo.contextIdentifier) ==
         Pal::Result::Success;
}
// ================================================================================================
// Submits work to a PAL queue through the GPA session so the submission is timed, and falls
// back to a plain queue submit when the timed path does not succeed.
//
// queue      - PAL queue to submit to.
// cmdId      - API-side command buffer ID attached to the timing data.
// submitInfo - PAL submission description, passed unchanged to either submit path.
//
// Returns the timed submit result on success, otherwise the result of the regular submit.
Pal::Result RgpCaptureMgr::TimedQueueSubmit(
    Pal::IQueue* queue,
    uint64_t cmdId,
    const Pal::SubmitInfo& submitInfo) const
{
  // Meta-data that ties the API command buffer to the generated timing information.
  // The SQTT command buffer ID and the frame index are always reported as zero here.
  Pal::uint64 apiId = cmdId;
  Pal::uint32 sqttId = 0;

  GpuUtil::TimedSubmitInfo timingInfo = {};
  timingInfo.pApiCmdBufIds = &apiId;
  timingInfo.pSqttCmdBufIds = &sqttId;
  timingInfo.frameIndex = 0;

  // Try the timed submit first
  const Pal::Result timedResult =
      trace_.gpa_session_->TimedSubmit(queue, submitInfo, timingInfo);
  if (timedResult == Pal::Result::Success) {
    return timedResult;
  }

  // Timed submit failed (or is not supported): punt to the non-timed submit
  return queue->Submit(submitInfo);
}
// ================================================================================================
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
//
+5 -2
View File
@@ -299,8 +299,8 @@ public:
void Finalize();
void PreDispatch(VirtualGPU* pQueue, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
void PostDispatch(VirtualGPU* pQueue);
void PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
void PostDispatch(VirtualGPU* gpu);
void WaitForDriverResume();
@@ -312,6 +312,9 @@ public:
void WriteBarrierStartMarker(const Pal::Developer::BarrierData& data) const;
void WriteBarrierEndMarker(const Pal::Developer::BarrierData& data) const;
bool RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const;
Pal::Result TimedQueueSubmit(
Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;
private:
// Steps that an RGP trace goes through
+17 -6
View File
@@ -33,10 +33,11 @@
namespace pal {
VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType,
uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
uint rtCU, amd::CommandQueue::Priority priority,
uint64_t residency_limit, uint max_command_buffers) {
Pal::IDevice* palDev = gpu.dev().iDev();
Pal::Result result;
Pal::CmdBufferCreateInfo cmdCreateInfo = {};
Pal::QueueCreateInfo qCreateInfo = {};
@@ -82,7 +83,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
}
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
residency_limit, max_command_buffers);
if (queue != nullptr) {
address addrQ = reinterpret_cast<address>(&queue[1]);
// Create PAL queue object
@@ -250,7 +252,14 @@ bool VirtualGPU::Queue::flush() {
submitInfo.ppExternPhysMem = palSdiRefs_.data();
// Submit command buffer to OS
if (Pal::Result::Success != iQueue_->Submit(submitInfo)) {
Pal::Result result;
if (gpu_.rgpCaptureEna()) {
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
iQueue_, cmdBufIdCurrent_, submitInfo);
} else {
result = iQueue_->Submit(submitInfo);
}
if (Pal::Result::Success != result) {
LogError("PAL failed to submit CMD!");
return false;
}
@@ -787,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
cmdAllocator_, rtCUs, priority,
residency_limit, max_cmd_buffers);
if (nullptr == queues_[MainEngine]) {
@@ -805,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
}
queues_[SdmaEngine] =
Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
} else {
queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
@@ -890,7 +899,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
// If the developer mode manager is available and it's not a device queue,
// then enable RGP capturing
if ((index() != 0) && dev().rgpCaptureMgr() != nullptr) {
bool dbg_vmid = false;
state_.rgpCaptureEnabled_ = true;
dev().rgpCaptureMgr()->RegisterTimedQueue(this, &dbg_vmid);
}
return true;
+9 -7
View File
@@ -50,7 +50,7 @@ class VirtualGPU : public device::VirtualDevice {
Queue(const Queue&) = delete;
Queue& operator=(const Queue&) = delete;
static Queue* Create(Pal::IDevice* palDev, //!< PAL device object
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
Pal::QueueType queueType, //!< PAL queue type
uint engineIdx, //!< Select particular engine index
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
@@ -60,12 +60,13 @@ class VirtualGPU : public device::VirtualDevice {
uint max_command_buffers //!< Number of allocated command buffers
);
Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers)
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
: iQueue_(nullptr),
iCmdBuffs_(max_command_buffers, nullptr),
iCmdFences_(max_command_buffers, nullptr),
last_kernel_(nullptr),
iDev_(palDev),
gpu_(gpu),
iDev_(iDev),
cmdBufIdSlot_(StartCmdBufIdx),
cmdBufIdCurrent_(StartCmdBufIdx),
cmbBufIdRetired_(0),
@@ -156,16 +157,17 @@ class VirtualGPU : public device::VirtualDevice {
private:
void DumpMemoryReferences() const;
const VirtualGPU& gpu_; //!< OCL virtual GPU object
Pal::IDevice* iDev_; //!< PAL device
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
uint cmdBufIdCurrent_; //!< Current global command buffer ID
uint cmbBufIdRetired_; //!< The last retired command buffer ID
uint cmdCnt_; //!< Counter of commands
std::unordered_map<GpuMemoryReference*, uint> memReferences_;
Util::VirtualLinearAllocator vlAlloc_;
std::vector<Pal::GpuMemoryRef> palMemRefs_;
std::vector<Pal::IGpuMemory*> palMems_;
std::vector<Pal::DoppRef> palDoppRefs_;
Util::VirtualLinearAllocator vlAlloc_;
std::vector<Pal::GpuMemoryRef> palMemRefs_;
std::vector<Pal::IGpuMemory*> palMems_;
std::vector<Pal::DoppRef> palDoppRefs_;
std::set<Pal::IGpuMemory*> sdiReferences_;
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
uint64_t residency_size_; //!< Resource residency size
+1 -1
View File
@@ -197,7 +197,7 @@ release(bool, GPU_VEGA10_ONLY, VEGA10_ONLY, \
"1 = Report vega10 only on OCL/ROCR") \
release_on_stg(bool, PAL_DISABLE_SDMA, false, \
"1 = Disable SDMA for PAL") \
release(uint, PAL_RGP_DISP_COUNT, 10, \
release(uint, PAL_RGP_DISP_COUNT, 50, \
"The number of dispatches for RGP capture with SQTT") \
release(bool, GPU_FORCE_WAVE_SIZE_32, false, \
"Forces WaveSize32 compilation in SC") \