From 705bd72c1a23dd7eeb18658587dd39e2d318ac9d Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 12 Jul 2018 12:40:04 -0400 Subject: [PATCH] P4 to Git Change 1579667 by gandryey@gera-w8 on 2018/07/12 12:31:33 SWDEV-155438 - Produce RGP Queue Timings chunk for OpenCL RGP files - Collect command buffer timing - Capture 50 dispatches by default Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#113 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#54 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#291 edit [ROCm/clr commit: 96d0ddb7283cfba50024a437fa570ee06e4e62bb] --- .../rocclr/runtime/device/pal/palgpuopen.cpp | 62 ++++++++++++++++--- .../rocclr/runtime/device/pal/palgpuopen.hpp | 7 ++- .../rocclr/runtime/device/pal/palvirtual.cpp | 23 +++++-- .../rocclr/runtime/device/pal/palvirtual.hpp | 16 ++--- projects/clr/rocclr/runtime/utils/flags.hpp | 2 +- 5 files changed, 85 insertions(+), 25 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp index 081f990dcb..0e1c21fdc6 100644 --- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.cpp @@ -122,20 +122,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform) result = false; } - // Initialize trace resources required by each queue (and queue family) - bool hasDebugVmid = true; - if (result) { user_event_ = new RgpSqttMarkerUserEventWithString; if (nullptr == user_event_) { result = false; } - //result = InitTraceQueueResources(trace_, &hasDebugVmid); - } - - // If we've failed to acquire the debug VMID, fail to trace - if (hasDebugVmid == false) { - result = false; } if (!result) { @@ -155,6 +146,59 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform) return result; } +// ================================================================================================ +// This function finds out all the queues in the device that we have to synchronize for RGP-traced +// frames and initializes resources for them. +bool RgpCaptureMgr::RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const +{ + bool result = true; + + // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients; + // it may be optional for Vulkan, but we provide it anyway if available). + Pal::KernelContextInfo kernelContextInfo = {}; + + Pal::Result palResult = gpu->queue(MainEngine).iQueue_->QueryKernelContextInfo(&kernelContextInfo); + + // Ensure we've acquired the debug VMID (note that some platforms do not + // implement this function, so don't fail the whole trace if so) + *debug_vmid = kernelContextInfo.flags.hasDebugVmid; + + // Register the queue with the GPA session class for timed queue operation support. + if (trace_.gpa_session_->RegisterTimedQueue(gpu->queue(MainEngine).iQueue_, gpu->index(), + kernelContextInfo.contextIdentifier) != Pal::Result::Success) { + result = false; + } + + return result; +} + +// ================================================================================================ +Pal::Result RgpCaptureMgr::TimedQueueSubmit( + Pal::IQueue* queue, + uint64_t cmdId, + const Pal::SubmitInfo& submitInfo) const +{ + // Fill in extra meta-data information to associate the API command buffer data with + // the generated timing information. + GpuUtil::TimedSubmitInfo timedSubmitInfo = {}; + Pal::uint64 apiCmdBufIds = cmdId; + Pal::uint32 sqttCmdBufIds = 0; + + timedSubmitInfo.pApiCmdBufIds = &apiCmdBufIds; + timedSubmitInfo.pSqttCmdBufIds = &sqttCmdBufIds; + timedSubmitInfo.frameIndex = 0; + + // Do a timed submit of all the command buffers + Pal::Result result = trace_.gpa_session_->TimedSubmit(queue, submitInfo, timedSubmitInfo); + + // Punt to non-timed submit if a timed submit fails (or is not supported) + if (result != Pal::Result::Success) { + result = queue->Submit(submitInfo); + } + + return result; +} + // ================================================================================================ // Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit(). // diff --git a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp index 42ba2f0a22..cf1f0efec6 100644 --- a/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palgpuopen.hpp @@ -299,8 +299,8 @@ public: void Finalize(); - void PreDispatch(VirtualGPU* pQueue, const HSAILKernel& kernel, size_t x, size_t y, size_t z); - void PostDispatch(VirtualGPU* pQueue); + void PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y, size_t z); + void PostDispatch(VirtualGPU* gpu); void WaitForDriverResume(); @@ -312,6 +312,9 @@ public: void WriteBarrierStartMarker(const Pal::Developer::BarrierData& data) const; void WriteBarrierEndMarker(const Pal::Developer::BarrierData& data) const; + bool RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const; + Pal::Result TimedQueueSubmit( + Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const; private: // Steps that an RGP trace goes through diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 7192bb74c3..9d35f0485f 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -33,10 +33,11 @@ namespace pal { -VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType, +VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType, uint engineIdx, Pal::ICmdAllocator* cmdAllocator, uint rtCU, amd::CommandQueue::Priority priority, uint64_t residency_limit, uint max_command_buffers) { + Pal::IDevice* palDev = gpu.dev().iDev(); Pal::Result result; Pal::CmdBufferCreateInfo cmdCreateInfo = {}; Pal::QueueCreateInfo qCreateInfo = {}; @@ -82,7 +83,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp } size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize); - VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers); + VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev, + residency_limit, max_command_buffers); if (queue != nullptr) { address addrQ = reinterpret_cast
(&queue[1]); // Create PAL queue object @@ -250,7 +252,14 @@ bool VirtualGPU::Queue::flush() { submitInfo.ppExternPhysMem = palSdiRefs_.data(); // Submit command buffer to OS - if (Pal::Result::Success != iQueue_->Submit(submitInfo)) { + Pal::Result result; + if (gpu_.rgpCaptureEna()) { + result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit( + iQueue_, cmdBufIdCurrent_, submitInfo); + } else { + result = iQueue_->Submit(submitInfo); + } + if (Pal::Result::Success != result) { LogError("PAL failed to submit CMD!"); return false; } @@ -787,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // hwRing_ should be set 0 if forced to have single scratch buffer hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; - queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue, + queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs, priority, residency_limit, max_cmd_buffers); if (nullptr == queues_[MainEngine]) { @@ -805,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, } queues_[SdmaEngine] = - Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_, + Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers); if (nullptr == queues_[SdmaEngine]) { return false; } } else { - queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, + queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers); if (nullptr == queues_[SdmaEngine]) { @@ -890,7 +899,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, // If the developer mode manager is available and it's not a device queue, // then enable RGP capturing if ((index() != 0) && dev().rgpCaptureMgr() != nullptr) { + bool dbg_vmid = false; state_.rgpCaptureEnabled_ = true; + dev().rgpCaptureMgr()->RegisterTimedQueue(this, &dbg_vmid); } return true; diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index fccee6d60e..221b98b36e 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -50,7 +50,7 @@ class VirtualGPU : public device::VirtualDevice { Queue(const Queue&) = delete; Queue& operator=(const Queue&) = delete; - static Queue* Create(Pal::IDevice* palDev, //!< PAL device object + static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object Pal::QueueType queueType, //!< PAL queue type uint engineIdx, //!< Select particular engine index Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator @@ -60,12 +60,13 @@ class VirtualGPU : public device::VirtualDevice { uint max_command_buffers //!< Number of allocated command buffers ); - Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers) + Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers) : iQueue_(nullptr), iCmdBuffs_(max_command_buffers, nullptr), iCmdFences_(max_command_buffers, nullptr), last_kernel_(nullptr), - iDev_(palDev), + gpu_(gpu), + iDev_(iDev), cmdBufIdSlot_(StartCmdBufIdx), cmdBufIdCurrent_(StartCmdBufIdx), cmbBufIdRetired_(0), @@ -156,16 +157,17 @@ class VirtualGPU : public device::VirtualDevice { private: void DumpMemoryReferences() const; + const VirtualGPU& gpu_; //!< OCL virtual GPU object Pal::IDevice* iDev_; //!< PAL device uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions uint cmdBufIdCurrent_; //!< Current global command buffer ID uint cmbBufIdRetired_; //!< The last retired command buffer ID uint cmdCnt_; //!< Counter of commands std::unordered_map memReferences_; - Util::VirtualLinearAllocator vlAlloc_; - std::vector palMemRefs_; - std::vector palMems_; - std::vector palDoppRefs_; + Util::VirtualLinearAllocator vlAlloc_; + std::vector palMemRefs_; + std::vector palMems_; + std::vector palDoppRefs_; std::set sdiReferences_; std::vector palSdiRefs_; uint64_t residency_size_; //!< Resource residency size diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp index 9c3afe475f..9ddaf715e6 100644 --- a/projects/clr/rocclr/runtime/utils/flags.hpp +++ b/projects/clr/rocclr/runtime/utils/flags.hpp @@ -197,7 +197,7 @@ release(bool, GPU_VEGA10_ONLY, VEGA10_ONLY, \ "1 = Report vega10 only on OCL/ROCR") \ release_on_stg(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ -release(uint, PAL_RGP_DISP_COUNT, 10, \ +release(uint, PAL_RGP_DISP_COUNT, 50, \ "The number of dispatches for RGP capture with SQTT") \ release(bool, GPU_FORCE_WAVE_SIZE_32, false, \ "Forces WaveSize32 compilation in SC") \