From f4e62df4cf323d755c7a699e94fd01bed0fe4b35 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Wed, 28 Oct 2020 14:40:01 -0400 Subject: [PATCH] Add SPM support for RGP RGP protocol supports SPM collection. Enable it in the PAL backend. Change-Id: I0fa17334addad037ba6689d11fff0993f7899e66 [ROCm/clr commit: 234a94f838f524ce3251e4eab2c442a1ddfa8e2f] --- projects/clr/rocclr/device/pal/palgpuopen.cpp | 65 ++++++++++++++++++- projects/clr/rocclr/device/pal/palgpuopen.hpp | 23 +++++-- 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/projects/clr/rocclr/device/pal/palgpuopen.cpp b/projects/clr/rocclr/device/pal/palgpuopen.cpp index d05a9593d6..f175ed1c6b 100644 --- a/projects/clr/rocclr/device/pal/palgpuopen.cpp +++ b/projects/clr/rocclr/device/pal/palgpuopen.cpp @@ -51,8 +51,12 @@ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device) max_sqtt_disp_(device_.settings().rgpSqttDispCount_), trace_gpu_mem_limit_(0), global_disp_count_(1), // Must start from 1 according to RGP spec + se_mask_(0), + perf_counter_mem_limit_(0), + perf_counter_frequency_(0), trace_enabled_(false), - inst_tracing_enabled_(false) { + inst_tracing_enabled_(false), + perf_counters_enabled_(false) { memset(&trace_, 0, sizeof(trace_)); } @@ -213,6 +217,8 @@ void RgpCaptureMgr::Finalize() { rgp_server_->DisableTraces(); } + dev_driver_server_->GetDriverControlServer()->StartLateDeviceInit(); + // Finalize the devmode manager dev_driver_server_->Finalize(); @@ -436,6 +442,51 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) { trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024; inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens; + se_mask_ = traceParameters.seMask; + + // Setup streamed performance counters + perf_counters_enabled_ = (traceParameters.flags.enableSpm != 0); + + DevDriver::RGPProtocol::ServerSpmConfig counter_config = {}; + DevDriver::Vector counters( + dev_driver_server_->GetMessageChannel()->GetAllocCb()); + rgp_server_->QuerySpmConfig(&counter_config, &counters); + + Pal::PerfExperimentProperties perf_properties = {}; + + result = gpu->dev().iDev()->GetPerfExperimentProperties(&perf_properties); + + // Querying performance properties should never fail + assert(result == Pal::Result::Success); + + perf_counter_frequency_ = counter_config.sampleFrequency; + perf_counter_mem_limit_ = counter_config.memoryLimitInMb * 1024 * 1024; + + perf_counter_ids_.clear(); + + for (size_t idx = 0; idx < counters.Size(); ++idx) { + const DevDriver::RGPProtocol::ServerSpmCounterId server_counter = counters[idx]; + const Pal::GpuBlockPerfProperties& block_perf_prop = + perf_properties.blocks[server_counter.blockId]; + + if (server_counter.instanceId == DevDriver::RGPProtocol::kSpmAllInstancesId) { + for (uint32_t instance = 0; instance < block_perf_prop.instanceCount; ++instance) { + GpuUtil::PerfCounterId counter_id = {}; + counter_id.block = static_cast(server_counter.blockId); + counter_id.instance = instance; + counter_id.eventId = server_counter.eventId; + + perf_counter_ids_.push_back(counter_id); + } + } else { + GpuUtil::PerfCounterId counter_id = {}; + counter_id.block = static_cast(server_counter.blockId); + counter_id.instance = server_counter.instanceId; + counter_id.eventId = server_counter.eventId; + + perf_counter_ids_.push_back(counter_id); + } + } // Notify the RGP server that we are starting a trace if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { @@ -504,11 +555,21 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) { GpuUtil::GpaSampleConfig sampleConfig = {}; sampleConfig.type = GpuUtil::GpaSampleType::Trace; + // Configure SQTT sampleConfig.sqtt.gpuMemoryLimit = trace_gpu_mem_limit_; - sampleConfig.sqtt.seMask = 0xF; + sampleConfig.sqtt.seMask = se_mask_; + sampleConfig.sqtt.flags.enable = true; sampleConfig.sqtt.flags.supressInstructionTokens = (inst_tracing_enabled_ == false); + // Configure SPM + if (perf_counters_enabled_ && !perf_counter_ids_.empty()) { + sampleConfig.perfCounters.gpuMemoryLimit = perf_counter_mem_limit_; + sampleConfig.perfCounters.spmTraceSampleInterval = perf_counter_frequency_; + sampleConfig.perfCounters.numCounters = perf_counter_ids_.size(); + sampleConfig.perfCounters.pIds = perf_counter_ids_.data(); + } + // Fill GPU commands gpu->eventBegin(MainEngine); result = trace_.gpa_session_->BeginSample( diff --git a/projects/clr/rocclr/device/pal/palgpuopen.hpp b/projects/clr/rocclr/device/pal/palgpuopen.hpp index 6ba667c74a..a95c38c18e 100644 --- a/projects/clr/rocclr/device/pal/palgpuopen.hpp +++ b/projects/clr/rocclr/device/pal/palgpuopen.hpp @@ -59,6 +59,8 @@ enum class RgpSqqtBarrierReason : uint32_t { #ifdef PAL_GPUOPEN_OCL // gpuopen headers #include "gpuopen.h" +// gpuutil headers +#include "gpuUtil/palGpaSession.h" // PAL forward declarations namespace Pal { @@ -68,11 +70,6 @@ class IQueueSemaphore; struct PalPublicSettings; } // namespace Pal -// GpuUtil forward declarations -namespace GpuUtil { -class GpaSession; -}; - // GPUOpen forward declarations namespace DevDriver { class DevDriverServer; @@ -375,8 +372,20 @@ class RgpCaptureMgr { uint32_t trace_gpu_mem_limit_; uint32_t global_disp_count_; - bool trace_enabled_; // True if tracing is currently enabled (master flag) - bool inst_tracing_enabled_; // Enable instruction-level SQTT tokens + uint32_t se_mask_; // Shader engine mask + uint64_t perf_counter_mem_limit_; // Memory limit for perf counters + uint32_t perf_counter_frequency_; // Counter sample frequency + + std::vector perf_counter_ids_; // List of perf counter ids + + union { + struct { + uint32_t trace_enabled_ : 1; // True if tracing is currently enabled (master flag) + uint32_t inst_tracing_enabled_; // Enable instruction-level SQTT tokens + uint32_t perf_counters_enabled_; // True if perf counters are enabled + }; + uint32_t value_; + }; PAL_DISALLOW_DEFAULT_CTOR(RgpCaptureMgr); PAL_DISALLOW_COPY_AND_ASSIGN(RgpCaptureMgr);