P4 to Git Change 1579667 by gandryey@gera-w8 on 2018/07/12 12:31:33
SWDEV-155438 - Produce RGP Queue Timings chunk for OpenCL RGP files
- Collect command buffer timing
- Capture 50 dispatches by default
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#113 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#54 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#291 edit
[ROCm/clr commit: 96d0ddb728]
Dieser Commit ist enthalten in:
@@ -122,20 +122,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
|
||||
result = false;
|
||||
}
|
||||
|
||||
// Initialize trace resources required by each queue (and queue family)
|
||||
bool hasDebugVmid = true;
|
||||
|
||||
if (result) {
|
||||
user_event_ = new RgpSqttMarkerUserEventWithString;
|
||||
if (nullptr == user_event_) {
|
||||
result = false;
|
||||
}
|
||||
//result = InitTraceQueueResources(trace_, &hasDebugVmid);
|
||||
}
|
||||
|
||||
// If we've failed to acquire the debug VMID, fail to trace
|
||||
if (hasDebugVmid == false) {
|
||||
result = false;
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
@@ -155,6 +146,59 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// This function finds out all the queues in the device that we have to synchronize for RGP-traced
|
||||
// frames and initializes resources for them.
|
||||
bool RgpCaptureMgr::RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const
|
||||
{
|
||||
bool result = true;
|
||||
|
||||
// Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
|
||||
// it may be optional for Vulkan, but we provide it anyway if available).
|
||||
Pal::KernelContextInfo kernelContextInfo = {};
|
||||
|
||||
Pal::Result palResult = gpu->queue(MainEngine).iQueue_->QueryKernelContextInfo(&kernelContextInfo);
|
||||
|
||||
// Ensure we've acquired the debug VMID (note that some platforms do not
|
||||
// implement this function, so don't fail the whole trace if so)
|
||||
*debug_vmid = kernelContextInfo.flags.hasDebugVmid;
|
||||
|
||||
// Register the queue with the GPA session class for timed queue operation support.
|
||||
if (trace_.gpa_session_->RegisterTimedQueue(gpu->queue(MainEngine).iQueue_, gpu->index(),
|
||||
kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
|
||||
result = false;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Submits `submitInfo` to `queue` through the GPA session's timed-submit path, tagging the
// submission with `cmdId` as its API command buffer ID. If the timed submit does not return
// Success, the same submission is issued through the regular IQueue::Submit() instead.
//
// Returns the result of the timed submit on success, otherwise the result of the plain submit.
Pal::Result RgpCaptureMgr::TimedQueueSubmit(
    Pal::IQueue* queue,
    uint64_t cmdId,
    const Pal::SubmitInfo& submitInfo) const
{
  // Storage for the IDs referenced by the timed-submit meta-data; both must stay alive until
  // TimedSubmit() returns. The SQTT command buffer ID is always reported as 0.
  Pal::uint64 apiId = cmdId;
  Pal::uint32 sqttId = 0;

  // Meta-data that associates the API command buffer with the generated timing information.
  GpuUtil::TimedSubmitInfo timingInfo = {};
  timingInfo.pApiCmdBufIds = &apiId;
  timingInfo.pSqttCmdBufIds = &sqttId;
  timingInfo.frameIndex = 0;

  // Preferred path: timed submit of all the command buffers.
  const Pal::Result timedResult = trace_.gpa_session_->TimedSubmit(queue, submitInfo, timingInfo);
  if (timedResult == Pal::Result::Success) {
    return timedResult;
  }

  // Timed submit failed (or is not supported): punt to a non-timed submit.
  return queue->Submit(submitInfo);
}
|
||||
|
||||
// ================================================================================================
|
||||
// Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
|
||||
//
|
||||
|
||||
@@ -299,8 +299,8 @@ public:
|
||||
|
||||
void Finalize();
|
||||
|
||||
void PreDispatch(VirtualGPU* pQueue, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
|
||||
void PostDispatch(VirtualGPU* pQueue);
|
||||
void PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y, size_t z);
|
||||
void PostDispatch(VirtualGPU* gpu);
|
||||
|
||||
void WaitForDriverResume();
|
||||
|
||||
@@ -312,6 +312,9 @@ public:
|
||||
|
||||
void WriteBarrierStartMarker(const Pal::Developer::BarrierData& data) const;
|
||||
void WriteBarrierEndMarker(const Pal::Developer::BarrierData& data) const;
|
||||
bool RegisterTimedQueue(VirtualGPU* gpu, bool* debug_vmid) const;
|
||||
Pal::Result TimedQueueSubmit(
|
||||
Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;
|
||||
|
||||
private:
|
||||
// Steps that an RGP trace goes through
|
||||
|
||||
@@ -33,10 +33,11 @@
|
||||
|
||||
namespace pal {
|
||||
|
||||
VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
|
||||
VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType,
|
||||
uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
|
||||
uint rtCU, amd::CommandQueue::Priority priority,
|
||||
uint64_t residency_limit, uint max_command_buffers) {
|
||||
Pal::IDevice* palDev = gpu.dev().iDev();
|
||||
Pal::Result result;
|
||||
Pal::CmdBufferCreateInfo cmdCreateInfo = {};
|
||||
Pal::QueueCreateInfo qCreateInfo = {};
|
||||
@@ -82,7 +83,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
|
||||
}
|
||||
|
||||
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
|
||||
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
|
||||
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
|
||||
residency_limit, max_command_buffers);
|
||||
if (queue != nullptr) {
|
||||
address addrQ = reinterpret_cast<address>(&queue[1]);
|
||||
// Create PAL queue object
|
||||
@@ -250,7 +252,14 @@ bool VirtualGPU::Queue::flush() {
|
||||
submitInfo.ppExternPhysMem = palSdiRefs_.data();
|
||||
|
||||
// Submit command buffer to OS
|
||||
if (Pal::Result::Success != iQueue_->Submit(submitInfo)) {
|
||||
Pal::Result result;
|
||||
if (gpu_.rgpCaptureEna()) {
|
||||
result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
|
||||
iQueue_, cmdBufIdCurrent_, submitInfo);
|
||||
} else {
|
||||
result = iQueue_->Submit(submitInfo);
|
||||
}
|
||||
if (Pal::Result::Success != result) {
|
||||
LogError("PAL failed to submit CMD!");
|
||||
return false;
|
||||
}
|
||||
@@ -787,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
// hwRing_ should be set 0 if forced to have single scratch buffer
|
||||
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
|
||||
|
||||
queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
|
||||
queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
|
||||
cmdAllocator_, rtCUs, priority,
|
||||
residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[MainEngine]) {
|
||||
@@ -805,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
}
|
||||
|
||||
queues_[SdmaEngine] =
|
||||
Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
|
||||
Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
|
||||
amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
|
||||
residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[SdmaEngine]) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
|
||||
queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
|
||||
idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
|
||||
residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[SdmaEngine]) {
|
||||
@@ -890,7 +899,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
// If the developer mode manager is available and it's not a device queue,
|
||||
// then enable RGP capturing
|
||||
if ((index() != 0) && dev().rgpCaptureMgr() != nullptr) {
|
||||
bool dbg_vmid = false;
|
||||
state_.rgpCaptureEnabled_ = true;
|
||||
dev().rgpCaptureMgr()->RegisterTimedQueue(this, &dbg_vmid);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
@@ -50,7 +50,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
Queue(const Queue&) = delete;
|
||||
Queue& operator=(const Queue&) = delete;
|
||||
|
||||
static Queue* Create(Pal::IDevice* palDev, //!< PAL device object
|
||||
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
|
||||
Pal::QueueType queueType, //!< PAL queue type
|
||||
uint engineIdx, //!< Select particular engine index
|
||||
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
|
||||
@@ -60,12 +60,13 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint max_command_buffers //!< Number of allocated command buffers
|
||||
);
|
||||
|
||||
Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers)
|
||||
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
|
||||
: iQueue_(nullptr),
|
||||
iCmdBuffs_(max_command_buffers, nullptr),
|
||||
iCmdFences_(max_command_buffers, nullptr),
|
||||
last_kernel_(nullptr),
|
||||
iDev_(palDev),
|
||||
gpu_(gpu),
|
||||
iDev_(iDev),
|
||||
cmdBufIdSlot_(StartCmdBufIdx),
|
||||
cmdBufIdCurrent_(StartCmdBufIdx),
|
||||
cmbBufIdRetired_(0),
|
||||
@@ -156,16 +157,17 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
private:
|
||||
void DumpMemoryReferences() const;
|
||||
const VirtualGPU& gpu_; //!< OCL virtual GPU object
|
||||
Pal::IDevice* iDev_; //!< PAL device
|
||||
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
|
||||
uint cmdBufIdCurrent_; //!< Current global command buffer ID
|
||||
uint cmbBufIdRetired_; //!< The last retired command buffer ID
|
||||
uint cmdCnt_; //!< Counter of commands
|
||||
std::unordered_map<GpuMemoryReference*, uint> memReferences_;
|
||||
Util::VirtualLinearAllocator vlAlloc_;
|
||||
std::vector<Pal::GpuMemoryRef> palMemRefs_;
|
||||
std::vector<Pal::IGpuMemory*> palMems_;
|
||||
std::vector<Pal::DoppRef> palDoppRefs_;
|
||||
Util::VirtualLinearAllocator vlAlloc_;
|
||||
std::vector<Pal::GpuMemoryRef> palMemRefs_;
|
||||
std::vector<Pal::IGpuMemory*> palMems_;
|
||||
std::vector<Pal::DoppRef> palDoppRefs_;
|
||||
std::set<Pal::IGpuMemory*> sdiReferences_;
|
||||
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
|
||||
uint64_t residency_size_; //!< Resource residency size
|
||||
|
||||
@@ -197,7 +197,7 @@ release(bool, GPU_VEGA10_ONLY, VEGA10_ONLY, \
|
||||
"1 = Report vega10 only on OCL/ROCR") \
|
||||
release_on_stg(bool, PAL_DISABLE_SDMA, false, \
|
||||
"1 = Disable SDMA for PAL") \
|
||||
release(uint, PAL_RGP_DISP_COUNT, 10, \
|
||||
release(uint, PAL_RGP_DISP_COUNT, 50, \
|
||||
"The number of dispatches for RGP capture with SQTT") \
|
||||
release(bool, GPU_FORCE_WAVE_SIZE_32, false, \
|
||||
"Forces WaveSize32 compilation in SC") \
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren